feat(lora): add LoRA support for SGLang (#4769)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>

feat(lora): add LoRA support for SGLang (#4769)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
f9839161 · Biswa Panda · GitHub · 21b44473 · f9839161 · f9839161
Unverified Commit f9839161 authored Apr 14, 2026 by Biswa Panda Committed by GitHub Apr 14, 2026
14 changed files
--- a/components/src/dynamo/sglang/init_llm.py
+++ b/components/src/dynamo/sglang/init_llm.py
@@ -80,6 +80,15 @@ async def init_decode(
    generate_endpoint = runtime.endpoint(
        f"{dynamo_args.namespace}.{dynamo_args.component}.{dynamo_args.endpoint}"
    )
+    load_lora_endpoint = runtime.endpoint(
+        f"{dynamo_args.namespace}.{dynamo_args.component}.load_lora"
+    )
+    unload_lora_endpoint = runtime.endpoint(
+        f"{dynamo_args.namespace}.{dynamo_args.component}.unload_lora"
+    )
+    list_loras_endpoint = runtime.endpoint(
+        f"{dynamo_args.namespace}.{dynamo_args.component}.list_loras"
+    )

    shutdown_endpoints[:] = [generate_endpoint]

@@ -132,6 +141,18 @@ async def init_decode(
                metrics_labels=metrics_labels,
                health_check_payload=health_check_payload,
            ),
+            load_lora_endpoint.serve_endpoint(
+                handler.load_lora,
+                metrics_labels=metrics_labels,
+            ),
+            unload_lora_endpoint.serve_endpoint(
+                handler.unload_lora,
+                metrics_labels=metrics_labels,
+            ),
+            list_loras_endpoint.serve_endpoint(
+                handler.list_loras,
+                metrics_labels=metrics_labels,
+            ),
            register_model_with_readiness_gate(
                engine,
                generate_endpoint,
@@ -187,6 +208,15 @@ async def init_prefill(
    generate_endpoint = runtime.endpoint(
        f"{dynamo_args.namespace}.{dynamo_args.component}.{dynamo_args.endpoint}"
    )
+    load_lora_endpoint = runtime.endpoint(
+        f"{dynamo_args.namespace}.{dynamo_args.component}.load_lora"
+    )
+    unload_lora_endpoint = runtime.endpoint(
+        f"{dynamo_args.namespace}.{dynamo_args.component}.unload_lora"
+    )
+    list_loras_endpoint = runtime.endpoint(
+        f"{dynamo_args.namespace}.{dynamo_args.component}.list_loras"
+    )

    shutdown_endpoints[:] = [generate_endpoint]

@@ -228,6 +258,18 @@ async def init_prefill(
                metrics_labels=metrics_labels,
                health_check_payload=health_check_payload,
            ),
+            load_lora_endpoint.serve_endpoint(
+                handler.load_lora,
+                metrics_labels=metrics_labels,
+            ),
+            unload_lora_endpoint.serve_endpoint(
+                handler.unload_lora,
+                metrics_labels=metrics_labels,
+            ),
+            list_loras_endpoint.serve_endpoint(
+                handler.list_loras,
+                metrics_labels=metrics_labels,
+            ),
            register_model_with_readiness_gate(
                engine,
                generate_endpoint,

--- a/components/src/dynamo/sglang/request_handlers/handler_base.py
+++ b/components/src/dynamo/sglang/request_handlers/handler_base.py
--- a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
@@ -272,6 +272,10 @@ class DecodeWorkerHandler(BaseWorkerHandler):
        priority = (request.get("routing") or {}).get("priority")
        logprob_kwargs = self._build_logprob_kwargs(request)

+        lora_path = self._resolve_lora(request)
+        if lora_path:
+            logging.debug(f"Request {context.id()} will use LoRA adapter: {lora_path}")
+
        if self.serving_mode == DisaggregationMode.DECODE:
            # Check if bootstrap_info is pre-computed in the request (from frontend)
            bootstrap_info = request.get("bootstrap_info")
@@ -306,6 +310,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
                rid=trace_id,
                data_parallel_rank=dp_rank,
                **self._session_kwargs(request),
+                lora_path=lora_path,
                **logprob_kwargs,
                **self._priority_kwargs(priority),
            )
@@ -340,6 +345,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
                rid=trace_id,
                data_parallel_rank=dp_rank,
                **self._session_kwargs(request),
+                lora_path=lora_path,
                **logprob_kwargs,
                **self._priority_kwargs(priority),
            )

--- a/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py
@@ -147,6 +147,12 @@ class PrefillWorkerHandler(BaseWorkerHandler):

        trace_header = build_trace_headers(context) if self.enable_trace else None

+        lora_path = self._resolve_lora(inner_request)
+        if lora_path:
+            logging.debug(
+                f"Prefill request {context.id()} will use LoRA adapter: {lora_path}"
+            )
+
        results = await self.engine.async_generate(
            **input_param,
            sampling_params=sampling_params,
@@ -158,6 +164,7 @@ class PrefillWorkerHandler(BaseWorkerHandler):
            rid=trace_id,
            data_parallel_rank=dp_rank,
            **self._session_kwargs(inner_request),
+            lora_path=lora_path,
            **self._priority_kwargs(priority),
        )


--- a/examples/backends/sglang/launch/lora/README.md
+++ b/examples/backends/sglang/launch/lora/README.md
+# LoRA with SGLang Backend
+
+For the full LoRA integration guide (setup, usage, API reference, troubleshooting), see [the shared LoRA guide](../../../../common/lora.md).
+
+## Quick Start
+
+```bash
+./setup_minio.sh    # Start MinIO, download & upload LoRA
+./agg_lora.sh       # Launch SGLang frontend + worker with LoRA
+```
+
+## SGLang-Specific Notes
+
+- The launch script uses `--lora-target-modules all` and `--max-lora-rank 64` by default
+- Override with environment variables: `MODEL`, `LORA_NAME`, `DYN_SYSTEM_PORT`, `DYN_HTTP_PORT`
+- SGLang LoRA loading goes through `engine.tokenizer_manager.load_lora_adapter()`
--- a/examples/backends/sglang/launch/lora/agg_lora.sh
+++ b/examples/backends/sglang/launch/lora/agg_lora.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Aggregated serving with LoRA support (SGLang backend).
+# GPUs: 1
+# Prerequisites: ./setup_minio.sh (starts MinIO, uploads LoRA)
+
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
+source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
+
+# S3/MinIO credentials
+export AWS_ENDPOINT=http://localhost:9000
+export AWS_ACCESS_KEY_ID=minioadmin
+export AWS_SECRET_ACCESS_KEY=minioadmin
+export AWS_REGION=us-east-1
+export AWS_ALLOW_HTTP=true
+
+# Dynamo LoRA configuration
+export DYN_LORA_ENABLED=true
+export DYN_LORA_PATH=/tmp/dynamo_loras_minio
+mkdir -p "$DYN_LORA_PATH"
+
+MODEL="${MODEL:-Qwen/Qwen3-0.6B}"
+LORA_NAME="${LORA_NAME:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
+SYSTEM_PORT="${DYN_SYSTEM_PORT:-8081}"
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
+# Default to profiled KV token cap when not overridden by the test scheduler
+: "${GPU_MEM_ARGS:=--max-total-tokens 2848}"
+
+print_launch_banner --no-curl "Launching Aggregated Serving + LoRA (1 GPU)" "$MODEL" "$HTTP_PORT"
+echo ""
+echo "Once running, test with:"
+echo "  curl -s -X POST http://localhost:${SYSTEM_PORT}/v1/loras \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{\"lora_name\": \"${LORA_NAME}\", \"source\": {\"uri\": \"s3://my-loras/${LORA_NAME}\"}}' | jq ."
+echo ""
+echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
+echo "    -H 'Content-Type: application/json' \\"
+echo "    -d '{\"model\": \"${LORA_NAME}\", \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}], \"max_tokens\": 300}' | jq ."
+echo "=========================================="
+
+# Frontend
+python3 -m dynamo.frontend &
+
+# Worker
+DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
+python3 -m dynamo.sglang \
+  --model-path "$MODEL" \
+  --served-model-name "$MODEL" \
+  --page-size 16 \
+  --tp 1 \
+  --trust-remote-code \
+  --skip-tokenizer-init \
+  --enable-lora \
+  --max-lora-rank 64 \
+  --lora-target-modules all \
+  $GPU_MEM_ARGS &
+
+wait_any_exit
--- a/examples/backends/sglang/launch/lora/setup_minio.sh
+++ b/examples/backends/sglang/launch/lora/setup_minio.sh
+../../../../common/setup_minio.sh
\ No newline at end of file
--- a/examples/backends/vllm/launch/lora/README.md
+++ b/examples/backends/vllm/launch/lora/README.md
-# S3-compatible storage backend LoRA Integration Guide
+# LoRA with vLLM Backend

-This guide explains how to set up and use LoRA (Low-Rank Adaptation) adapters with Dynamo using S3-compatible storage backend (e.g. MinIO, AWS S3, GCS, etc.).
-
-## Overview
-
-This example demonstrates how to:
-1. Set up MinIO as a local S3-compatible storage
-2. Download LoRA adapters from Hugging Face Hub
-3. Upload LoRA adapters to MinIO
-4. Load and use LoRA adapters with Dynamo
-5. Run inference with LoRA-adapted models
-6. Manage (load/unload) LoRA adapters
-
-## Prerequisites
-
-### Required Software
- Docker (for running MinIO)
- Python 3.8+
- AWS CLI: `pip install awscli`
- Hugging Face CLI: `pip install huggingface-hub`
- jq (optional, for pretty JSON output): `sudo apt install jq`
-
-### Python Dependencies
-Make sure you have Dynamo installed with vLLM support:
-```bash
-pip install dynamo vllm
-```
+For the full LoRA integration guide (setup, usage, API reference, troubleshooting), see [the shared LoRA guide](../../../../common/lora.md).

 ## Quick Start

-### Step 1: Setup MinIO and Upload LoRA
-
-Run the setup script to start MinIO and download/upload a LoRA adapter from Hugging Face:
-
-```bash
-./setup_minio.sh
-```
-
-This script will:
- Start MinIO in a Docker container
- Download a LoRA adapter from Hugging Face Hub (default: `codelion/Qwen3-0.6B-accuracy-recovery-lora`)
- Upload the LoRA to MinIO at `s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora`
-
-#### Script Options
-
-The setup script supports different modes:
-
-```bash
-# Full setup (default) - start MinIO, download & upload LoRA
-./setup_minio.sh
-
-# Start MinIO only (without downloading/uploading)
-./setup_minio.sh --start
-
-# Stop MinIO
-./setup_minio.sh --stop
-
-# Show help
-./setup_minio.sh --help
-```
-
-#### Customize the LoRA to Download
-
-You can specify a different LoRA repository and name:
-
-```bash
-HF_LORA_REPO="username/lora-repo" \
-LORA_NAME="my-lora" \
-  ./setup_minio.sh
-```
-
-### Step 2: Launch Dynamo with LoRA Support
-
-Start the Dynamo frontend and worker with LoRA support enabled:
-
-```bash
-./agg_lora.sh
-```
-
-This will:
- Set up AWS credentials for MinIO
- Start the Dynamo frontend on port 8000
- Start the Dynamo worker (vLLM) on port 8081 with LoRA support
-
-Wait for the services to start (check the logs for "Application startup complete").
-
-## Working with LoRAs
-
-### 1. Check Available Models
-
-List all available models (base model only at first):
-
-```bash
-curl http://localhost:8000/v1/models | jq .
-```
-
-### 2. Load a LoRA Adapter
-
-Load a LoRA from S3-compatible storage backend (e.g. MinIO):
-
-```bash
-curl -X POST http://localhost:8081/v1/loras \
-  -H "Content-Type: application/json" \
-  -d '{
-    "lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
-    "source": {
-      "uri": "s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"
-    }
-  }' | jq .
-```
-
-Expected response:
-```json
-{
-  "status": "success",
-  "message": "LoRA adapter 'codelion/Qwen3-0.6B-accuracy-recovery-lora' loaded successfully",
-  "lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
-  "lora_id": 1207343256
-}
-```
-
-### 3. List Loaded LoRAs
-
-Check which LoRAs are currently loaded:
-
-```bash
-curl http://localhost:8081/v1/loras | jq .
-```
-
-### 4. Verify LoRA in Models List
-
-After loading, the LoRA should appear in the models list:
-
-```bash
-curl http://localhost:8000/v1/models | jq .
-```
-
-You should see both the base model and the LoRA adapter listed.
-
-### 5. Run Inference with LoRA
-
-#### Using the LoRA-adapted model:
-
-```bash
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
-    "messages": [{
-      "role": "user",
-      "content": "What is good low risk investment strategy?"
-    }],
-    "max_tokens": 300,
-    "temperature": 0.1
-  }' | jq .
-```
-
-#### For comparison, using the base model:
-
 ```bash
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "Qwen/Qwen3-0.6B",
-    "messages": [{
-      "role": "user",
-      "content": "What is good low risk investment strategy?"
-    }],
-    "max_tokens": 300
-  }' | jq .
-```
-
-### 6. Unload a LoRA
-
-When you no longer need a LoRA, unload it to free up resources:
-
-```bash
-curl -X DELETE http://localhost:8081/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora | jq .
-```
-
-Expected response:
-```json
-{
-  "status": "success",
-  "message": "LoRA unloaded successfully"
-}
+./setup_minio.sh    # Start MinIO, download & upload LoRA
+./agg_lora.sh       # Launch vLLM frontend + worker with LoRA
 ```

-After unloading, the LoRA will be removed from both `/v1/loras` and `/v1/models` endpoints.
-
-## Configuration
+## vLLM-Specific Notes

-### Environment Variables
+- Default `--max-lora-rank 64` (same as SGLang)
+- Override with environment variables: `MODEL`, `LORA_NAME`, `MAX_MODEL_LEN`, `MAX_CONCURRENT_SEQS`

-The following environment variables can be configured:
+### KV-Aware Routing (2 GPUs)

 ```bash
-# S3-compatible storage backend Configuration
-export AWS_ENDPOINT=http://localhost:9000
-export AWS_ACCESS_KEY_ID=minioadmin
-export AWS_SECRET_ACCESS_KEY=minioadmin
-export AWS_REGION=us-east-1
-
-# Dynamo LoRA Configuration
-export DYN_LORA_ENABLED=true
-export DYN_LORA_PATH=/tmp/dynamo_loras_minio
+./agg_lora_router.sh
 ```

-### MinIO Console
-
-Access the MinIO web console at http://localhost:9001
- Username: `minioadmin`
- Password: `minioadmin`
-
-## Troubleshooting
-
-### MinIO won't start
- Check if ports 9000 and 9001 are already in use
- Ensure Docker is running
- Check Docker logs: `docker logs dynamo-minio`
- Try stopping any existing MinIO containers: `./setup_minio.sh --stop`
- Restart MinIO: `./setup_minio.sh --start`
-
-### LoRA fails to load
- Verify the LoRA is uploaded to MinIO: `aws --endpoint-url=http://localhost:9000 s3 ls s3://my-loras/`
- Check AWS credentials are set correctly
- Ensure the LoRA files are compatible with the base model
- Check vLLM logs for detailed error messages
-
-### Inference fails
- Verify the model name matches exactly (case-sensitive)
- Check if the LoRA is loaded: `curl http://localhost:8081/v1/loras`
- Ensure the base model supports the LoRA rank
- Check that max_lora_rank in the worker config is >= the LoRA rank
-
-### Cache issues
- Check the cache directory: `ls -la /tmp/dynamo_loras_minio/`
- Clear the cache if needed: `rm -rf /tmp/dynamo_loras_minio/*`
- Ensure the cache directory is writable
-
-## Advanced Usage
-
-### Loading Multiple LoRAs
-
-You can load multiple LoRA adapters simultaneously:
-
-```bash
-# Load first LoRA
-curl -X POST http://localhost:8081/v1/loras \
-  -H "Content-Type: application/json" \
-  -d '{"lora_name": "lora1", "source": {"uri": "s3://my-loras/lora1"}}'
-
-# Load second LoRA
-curl -X POST http://localhost:8081/v1/loras \
-  -H "Content-Type: application/json" \
-  -d '{"lora_name": "lora2", "source": {"uri": "s3://my-loras/lora2"}}'
-```
-
-### Using Different Base Models
-
-To use a different base model, modify the `--model` parameter in `agg_lora.sh`:
-
-```bash
-python -m dynamo.vllm --model meta-llama/Llama-2-7b-hf --enable-lora --max-lora-rank 64
-```
-
-Ensure your LoRAs are compatible with the chosen base model.
-
-## Cleanup
-
-### Stop Services
-
-Press `Ctrl+C` in the terminal running `agg_lora.sh` to stop Dynamo services.
-
-### Stop MinIO
-
-```bash
-# Using the setup script (recommended)
-./setup_minio.sh --stop
-
-# Or manually with Docker
-docker stop dynamo-minio
-docker rm dynamo-minio
-```
-
-### Clean Up Data
-
-```bash
-# Remove MinIO data
-rm -rf ~/dynamo_minio_data
-
-# Remove LoRA cache
-rm -rf /tmp/dynamo_loras_minio
-```
-
-## API Reference
-
-### Load LoRA
- **Endpoint**: `POST /v1/loras`
- **Body**: `{"lora_name": "string", "source": {"uri": "string"}}`
- **Response**: `{"status": "success", "lora_id": int}`
-
-### List LoRAs
- **Endpoint**: `GET /v1/loras`
- **Response**: Array of loaded LoRAs
-
-### Unload LoRA
- **Endpoint**: `DELETE /v1/loras/{lora_name}`
- **Response**: `{"status": "success", "message": "string"}`
-
-### List Models
- **Endpoint**: `GET /v1/models`
- **Response**: OpenAI-compatible models list
-
-### Chat Completions
- **Endpoint**: `POST /v1/chat/completions`
- **Body**: OpenAI-compatible chat completion request
- **Response**: OpenAI-compatible chat completion response
+Launches two vLLM workers behind a KV-aware router. Load the LoRA on both workers (ports 8081 and 8082); requests are then routed with KV cache affinity for better cache hit rates.
--- a/examples/backends/vllm/launch/lora/setup_minio.sh
+++ b/examples/backends/vllm/launch/lora/setup_minio.sh
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-# Script to setup MinIO and upload LoRA adapters from Hugging Face Hub
-
-set -e
-
-# Get the directory where this script is located
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-# Colors for output
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-RED='\033[0;31m'
-NC='\033[0m' # No Color
-
-# Configuration
-MINIO_DATA_DIR="${HOME}/dynamo_minio_data"
-MINIO_ENDPOINT="http://localhost:9000"
-MINIO_ACCESS_KEY="minioadmin"
-MINIO_SECRET_KEY="minioadmin"
-BUCKET_NAME="my-loras"
-
-# Default LoRA to download (can be overridden)
-HF_LORA_REPO="${HF_LORA_REPO:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
-LORA_NAME="${LORA_NAME:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
-# TEMP_DIR will be created using mktemp when needed
-TEMP_DIR=""
-
-# HF_CLI_CMD will be set to either "hf" or "huggingface-cli" based on huggingface-hub python package version
-# Starting from HF v0.34.0, the `huggingface-cli` command is deprecated in favor of `hf`.
-# Please refer to https://huggingface.co/blog/hf-cli for more details.
-HF_CLI_CMD=""
-
-# Parse command line arguments
-MODE="full"
-if [ "$1" = "--start" ]; then
-    MODE="start"
-elif [ "$1" = "--stop" ]; then
-    MODE="stop"
-elif [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
-    MODE="help"
-elif [ -n "$1" ]; then
-    echo -e "${RED}Error: Unknown option '$1'${NC}"
-    MODE="help"
-fi
-
-print_info() {
-    echo -e "${YELLOW}→ $1${NC}"
-}
-
-print_success() {
-    echo -e "${GREEN}✓ $1${NC}"
-}
-
-print_error() {
-    echo -e "${RED}✗ $1${NC}"
-}
-
-# Show help message
-show_help() {
-    echo "Usage: $0 [OPTIONS]"
-    echo ""
-    echo "Setup MinIO and upload LoRA adapters from Hugging Face Hub"
-    echo ""
-    echo "Options:"
-    echo "  (no options)  Run full setup: start MinIO, download and upload LoRA"
-    echo "  --start       Only start MinIO container"
-    echo "  --stop        Stop and remove MinIO container"
-    echo "  --help, -h    Show this help message"
-    echo ""
-    echo "Environment Variables:"
-    echo "  HF_LORA_REPO  Hugging Face repository (default: ${HF_LORA_REPO:-codelion/Qwen3-0.6B-accuracy-recovery-lora})"
-    echo "  LORA_NAME     Local name for the LoRA (default: ${LORA_NAME:-codelion/Qwen3-0.6B-accuracy-recovery-lora})"
-    echo ""
-    echo "Examples:"
-    echo "  $0                                    # Full setup"
-    echo "  $0 --start                            # Start MinIO only"
-    echo "  $0 --stop                             # Stop MinIO"
-    echo "  HF_LORA_REPO=user/repo $0             # Use custom LoRA"
-    echo ""
-}
-
-# Check if required tools are installed
-check_dependencies() {
-    print_info "Checking dependencies..."
-
-    if ! command -v docker &> /dev/null; then
-        echo "Error: docker is not installed"
-        exit 1
-    fi
-
-    if ! command -v aws &> /dev/null; then
-        echo "Error: aws-cli is not installed. Install with: pip install awscli"
-        exit 1
-    fi
-
-    # Check for either hf or huggingface-cli
-    if command -v hf &> /dev/null; then
-        HF_CLI_CMD="hf"
-        print_success "Found Hugging Face CLI: hf ($(hf version))"
-    elif command -v huggingface-cli &> /dev/null; then
-        HF_CLI_CMD="huggingface-cli"
-        print_success "Found Hugging Face CLI: huggingface-cli ($(huggingface-cli version))"
-    else
-        echo "Error: Neither 'hf' nor 'huggingface-cli' is installed. Install with: pip install huggingface-hub[cli]"
-        exit 1
-    fi
-
-    print_success "All dependencies are installed"
-}
-
-# Start MinIO using Docker
-start_minio() {
-    print_info "Setting up MinIO..."
-
-    # Create data directory
-    mkdir -p "${MINIO_DATA_DIR}"
-
-    # Stop and remove existing container if it exists
-    docker stop dynamo-minio 2>/dev/null || true
-    docker rm dynamo-minio 2>/dev/null || true
-
-    # Start MinIO
-    print_info "Starting MinIO container..."
-    docker run -d \
-        --name dynamo-minio \
-        -p 9000:9000 \
-        -p 9001:9001 \
-        -v "${MINIO_DATA_DIR}:/data" \
-        quay.io/minio/minio server /data \
-        --console-address ":9001"
-
-    # Wait for MinIO to be ready
-    print_info "Waiting for MinIO to be ready..."
-    for i in {1..30}; do
-        if curl -s ${MINIO_ENDPOINT}/minio/health/live > /dev/null 2>&1; then
-            print_success "MinIO is ready"
-            break
-        fi
-        if [ $i -eq 30 ]; then
-            echo "Error: MinIO did not start in time"
-            exit 1
-        fi
-        sleep 1
-    done
-
-    print_success "MinIO started successfully"
-    echo "  - MinIO API: ${MINIO_ENDPOINT}"
-    echo "  - MinIO Console: http://localhost:9001"
-    echo "  - Username: ${MINIO_ACCESS_KEY}"
-    echo "  - Password: ${MINIO_SECRET_KEY}"
-}
-
-# Configure AWS CLI for MinIO
-configure_aws_cli() {
-    print_info "Configuring AWS CLI for MinIO..."
-
-    export AWS_ACCESS_KEY_ID="${MINIO_ACCESS_KEY}"
-    export AWS_SECRET_ACCESS_KEY="${MINIO_SECRET_KEY}"
-    export AWS_ENDPOINT_URL="${MINIO_ENDPOINT}"
-
-    # Create bucket if it doesn't exist
-    if ! aws --endpoint-url=${MINIO_ENDPOINT} s3 ls s3://${BUCKET_NAME} 2>/dev/null; then
-        print_info "Creating bucket: ${BUCKET_NAME}"
-        aws --endpoint-url=${MINIO_ENDPOINT} s3 mb s3://${BUCKET_NAME}
-        print_success "Bucket created"
-    else
-        print_success "Bucket already exists: ${BUCKET_NAME}"
-    fi
-}
-
-# Download LoRA from Hugging Face Hub
-download_lora_from_hf() {
-    print_info "Downloading LoRA from Hugging Face Hub..."
-    echo "  - Repository: ${HF_LORA_REPO}"
-    echo "  - Local name: ${LORA_NAME}"
-
-    # Create temporary directory using mktemp (global variable for cleanup)
-    TEMP_DIR=$(mktemp -d -t lora_download_XXXXXX)
-
-    # Download LoRA adapter files using the detected CLI
-    print_info "Downloading adapter files using ${HF_CLI_CMD}..."
-    if [ "${HF_CLI_CMD}" = "huggingface-cli" ]; then
-        huggingface-cli download "${HF_LORA_REPO}" \
-            --local-dir "${TEMP_DIR}" \
-            --local-dir-use-symlinks False
-    else
-        hf download "${HF_LORA_REPO}" \
-            --local-dir "${TEMP_DIR}"
-    fi
-
-    print_success "LoRA downloaded to ${TEMP_DIR}"
-
-    rm -rf "${TEMP_DIR}/.cache"
-    # List downloaded files
-    echo "Downloaded files:"
-    ls -lh "${TEMP_DIR}"
-}
-
-# Upload LoRA to MinIO
-upload_lora_to_minio() {
-    print_info "Uploading LoRA to MinIO..."
-
-    # Upload all files to S3
-    aws --endpoint-url=${MINIO_ENDPOINT} s3 sync \
-        "${TEMP_DIR}" \
-        "s3://${BUCKET_NAME}/${LORA_NAME}" \
-        --exclude "*.git*"
-
-    print_success "LoRA uploaded to s3://${BUCKET_NAME}/${LORA_NAME}"
-
-    # List uploaded files
-    echo "Uploaded files:"
-    aws --endpoint-url=${MINIO_ENDPOINT} s3 ls "s3://${BUCKET_NAME}/${LORA_NAME}/" --recursive
-}
-
-# Cleanup temp files
-cleanup() {
-    if [ -n "${TEMP_DIR}" ] && [ -d "${TEMP_DIR}" ]; then
-        print_info "Cleaning up temporary files..."
-        rm -rf "${TEMP_DIR}"
-        print_success "Cleanup complete"
-    fi
-}
-
-# Stop MinIO
-stop_minio() {
-    print_info "Stopping MinIO..."
-
-    if docker ps | grep -q dynamo-minio; then
-        docker stop dynamo-minio 2>/dev/null
-        print_success "MinIO container stopped"
-    else
-        print_info "MinIO container is not running"
-    fi
-
-    if docker ps -a | grep -q dynamo-minio; then
-        docker rm dynamo-minio 2>/dev/null
-        print_success "MinIO container removed"
-    fi
-
-    echo ""
-    echo "MinIO has been stopped."
-    echo "Data is preserved in: ${MINIO_DATA_DIR}"
-    echo ""
-    echo "To start MinIO again:"
-    echo "  $0 --start"
-    echo ""
-}
-
-# Start MinIO only (without downloading/uploading LoRA)
-start_only() {
-    echo "========================================"
-    echo "Starting MinIO"
-    echo "========================================"
-    echo ""
-
-    start_minio
-    echo ""
-
-    echo "========================================"
-    echo "MinIO Started!"
-    echo "========================================"
-    echo ""
-    echo "MinIO is now running."
-    echo ""
-    echo "To upload a LoRA, run the full setup:"
-    echo "  $0"
-    echo ""
-    echo "Or manually upload using AWS CLI:"
-    echo "  export AWS_ACCESS_KEY_ID=${MINIO_ACCESS_KEY}"
-    echo "  export AWS_SECRET_ACCESS_KEY=${MINIO_SECRET_KEY}"
-    echo "  aws --endpoint-url=${MINIO_ENDPOINT} s3 cp your-lora/ s3://${BUCKET_NAME}/your-lora/ --recursive"
-    echo ""
-    echo "To stop MinIO:"
-    echo "  $0 --stop"
-    echo ""
-}
-
-# Full setup (start MinIO + download/upload LoRA)
-full_setup() {
-    echo "========================================"
-    echo "MinIO Setup & LoRA Upload Script"
-    echo "========================================"
-    echo ""
-
-    check_dependencies
-    echo ""
-
-    start_minio
-    echo ""
-
-    configure_aws_cli
-    echo ""
-
-    download_lora_from_hf
-    echo ""
-
-    upload_lora_to_minio
-    echo ""
-
-    cleanup
-    echo ""
-
-    echo "========================================"
-    echo "Setup Complete!"
-    echo "========================================"
-    echo ""
-    echo "MinIO is running and LoRA has been uploaded."
-    echo ""
-    echo "Next steps:"
-    echo "  1. Run the Dynamo service with LoRA support:"
-    echo "     ${SCRIPT_DIR}/agg_lora.sh"
-    echo ""
-    echo "  2. Load the LoRA adapter:"
-    echo "     curl -X POST http://localhost:8081/v1/loras \\"
-    echo "       -H \"Content-Type: application/json\" \\"
-    echo "       -d '{\"lora_name\": \"${LORA_NAME}\", \"source\": {\"uri\": \"s3://${BUCKET_NAME}/${LORA_NAME}\"}}'"
-    echo ""
-    echo "  3. Run inference with the LoRA:"
-    echo "     curl -X POST http://localhost:8000/v1/chat/completions \\"
-    echo "       -H \"Content-Type: application/json\" \\"
-    echo "       -d '{\"model\": \"${LORA_NAME}\", \"messages\": [{\"role\": \"user\", \"content\": \"your prompt here\"}]}'"
-    echo ""
-    echo "To stop MinIO:"
-    echo "  $0 --stop"
-    echo ""
-}
-
-# Main execution
-case "$MODE" in
-    start)
-        start_only
-        ;;
-    stop)
-        stop_minio
-        ;;
-    help)
-        show_help
-        exit 0
-        ;;
-    full)
-        full_setup
-        ;;
-    *)
-        echo "Error: Unknown mode '$MODE'"
-        show_help
-        exit 1
-        ;;
-esac
-
--- a/examples/backends/vllm/launch/lora/setup_minio.sh
+++ b/examples/backends/vllm/launch/lora/setup_minio.sh
+../../../../common/setup_minio.sh
\ No newline at end of file
--- a/examples/common/lora.md
+++ b/examples/common/lora.md
+# S3-compatible Storage Backend LoRA Integration Guide
+
+This guide explains how to set up and use LoRA (Low-Rank Adaptation) adapters with Dynamo using an S3-compatible storage backend (e.g., MinIO, AWS S3, or GCS).
+
+## Overview
+
+This example demonstrates how to:
+1. Set up MinIO as local S3-compatible storage
+2. Download LoRA adapters from Hugging Face Hub
+3. Upload LoRA adapters to MinIO
+4. Load and use LoRA adapters with Dynamo
+5. Run inference with LoRA-adapted models
+6. Manage (load/unload) LoRA adapters
+
+## Prerequisites
+
+### Required Software
+- Docker (for running MinIO)
+- Python 3.10+
+- AWS CLI: `pip install awscli`
+- Hugging Face CLI: `pip install huggingface-hub[cli]`
+- jq (optional, for pretty JSON output): `sudo apt install jq`
+
+### Python Dependencies
+Make sure you have Dynamo installed with your chosen backend. See the
+[Dynamo quickstart guide](https://docs.nvidia.com/dynamo/getting-started/quickstart)
+for setup instructions.
+
+## Quick Start
+
+### Step 1: Setup MinIO and Upload LoRA
+
+Run the setup script to start MinIO and download/upload a LoRA adapter from Hugging Face:
+
+```bash
+./setup_minio.sh
+```
+
+This script will:
+- Start MinIO in a Docker container
+- Download a LoRA adapter from Hugging Face Hub (default: `codelion/Qwen3-0.6B-accuracy-recovery-lora`)
+- Upload the LoRA to MinIO at `s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora`
+
+#### Script Options
+
+The setup script supports different modes:
+
+```bash
+# Full setup (default) - start MinIO, download & upload LoRA
+./setup_minio.sh
+
+# Start MinIO only (without downloading/uploading)
+./setup_minio.sh --start
+
+# Stop MinIO
+./setup_minio.sh --stop
+
+# Show help
+./setup_minio.sh --help
+```
+
+#### Customize the LoRA to Download
+
+You can specify a different LoRA repository and name:
+
+```bash
+HF_LORA_REPO="username/lora-repo" \
+LORA_NAME="my-lora" \
+  ./setup_minio.sh
+```
+
+### Step 2: Launch Dynamo with LoRA Support
+
+Start the Dynamo frontend and worker with LoRA support enabled:
+
+```bash
+./agg_lora.sh
+```
+
+This will:
+- Set up AWS credentials for MinIO
+- Start the Dynamo frontend on port 8000
+- Start the Dynamo worker on port 8081 with LoRA support
+
+Wait for the services to start (check the logs for "Application startup complete").
+
+## Working with LoRAs
+
+### 1. Check Available Models
+
+List all available models (base model only at first):
+
+```bash
+curl http://localhost:8000/v1/models | jq .
+```
+
+### 2. Load a LoRA Adapter
+
+Load a LoRA from an S3-compatible storage backend (e.g., MinIO):
+
+```bash
+curl -X POST http://localhost:8081/v1/loras \
+  -H "Content-Type: application/json" \
+  -d '{
+    "lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
+    "source": {
+      "uri": "s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"
+    }
+  }' | jq .
+```
+
+Expected response:
+```json
+{
+  "status": "success",
+  "message": "LoRA adapter 'codelion/Qwen3-0.6B-accuracy-recovery-lora' loaded successfully",
+  "lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
+  "lora_id": 1207343256
+}
+```
+
+### 3. List Loaded LoRAs
+
+Check which LoRAs are currently loaded:
+
+```bash
+curl http://localhost:8081/v1/loras | jq .
+```
+
+### 4. Verify LoRA in Models List
+
+After loading, the LoRA should appear in the models list:
+
+```bash
+curl http://localhost:8000/v1/models | jq .
+```
+
+You should see both the base model and the LoRA adapter listed.
+
+### 5. Run Inference with LoRA
+
+#### Using the LoRA-adapted model:
+
+```bash
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
+    "messages": [{
+      "role": "user",
+      "content": "What is good low risk investment strategy?"
+    }],
+    "max_tokens": 300,
+    "temperature": 0.1
+  }' | jq .
+```
+
+#### For comparison, using the base model:
+
+```bash
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen/Qwen3-0.6B",
+    "messages": [{
+      "role": "user",
+      "content": "What is good low risk investment strategy?"
+    }],
+    "max_tokens": 300
+  }' | jq .
+```
+
+### 6. Unload a LoRA
+
+When you no longer need a LoRA, unload it to free up resources:
+
+```bash
+curl -X DELETE http://localhost:8081/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora | jq .
+```
+
+Expected response:
+```json
+{
+  "status": "success",
+  "message": "LoRA unloaded successfully"
+}
+```
+
+After unloading, the LoRA is removed from both the `/v1/loras` and the `/v1/models` endpoints.
+
+## Configuration
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+```bash
+# S3-compatible storage backend Configuration
+export AWS_ENDPOINT=http://localhost:9000
+export AWS_ACCESS_KEY_ID=minioadmin
+export AWS_SECRET_ACCESS_KEY=minioadmin
+export AWS_REGION=us-east-1
+
+# Dynamo LoRA Configuration
+export DYN_LORA_ENABLED=true
+export DYN_LORA_PATH=/tmp/dynamo_loras_minio
+```
+
+### MinIO Console
+
+Access the MinIO web console at `http://localhost:9001`
+- Username: `minioadmin`
+- Password: `minioadmin`
+
+## Troubleshooting
+
+### MinIO won't start
+- Check if ports 9000 and 9001 are already in use
+- Ensure Docker is running
+- Check Docker logs: `docker logs dynamo-minio`
+- Try stopping any existing MinIO containers: `./setup_minio.sh --stop`
+- Restart MinIO: `./setup_minio.sh --start`
+
+### LoRA fails to load
+- Verify the LoRA is uploaded to MinIO: `aws --endpoint-url=http://localhost:9000 s3 ls s3://my-loras/`
+- Check AWS credentials are set correctly
+- Ensure the LoRA files are compatible with the base model
+- Check worker logs for detailed error messages
+
+### Inference fails
+- Verify the model name matches exactly (case-sensitive)
+- Check if the LoRA is loaded: `curl http://localhost:8081/v1/loras`
+- Ensure the base model supports the LoRA rank
+- Check that max_lora_rank in the worker config is >= the LoRA rank
+
+### Cache issues
+- Check the cache directory: `ls -la /tmp/dynamo_loras_minio/`
+- Clear the cache if needed: `rm -rf /tmp/dynamo_loras_minio/*`
+- Ensure the cache directory is writable
+
+## Advanced Usage
+
+### Loading Multiple LoRAs
+
+You can load multiple LoRA adapters simultaneously:
+
+```bash
+# Load first LoRA
+curl -X POST http://localhost:8081/v1/loras \
+  -H "Content-Type: application/json" \
+  -d '{"lora_name": "lora1", "source": {"uri": "s3://my-loras/lora1"}}'
+
+# Load second LoRA
+curl -X POST http://localhost:8081/v1/loras \
+  -H "Content-Type: application/json" \
+  -d '{"lora_name": "lora2", "source": {"uri": "s3://my-loras/lora2"}}'
+```
+
+### Using Different Base Models
+
+To use a different base model, modify the `MODEL` environment variable:
+
+```bash
+MODEL=meta-llama/Llama-2-7b-hf ./agg_lora.sh
+```
+
+Ensure your LoRAs are compatible with the chosen base model.
+
+## Cleanup
+
+### Stop Services
+
+Press `Ctrl+C` in the terminal running `agg_lora.sh` to stop Dynamo services.
+
+### Stop MinIO
+
+```bash
+# Using the setup script (recommended)
+./setup_minio.sh --stop
+
+# Or manually with Docker
+docker stop dynamo-minio
+docker rm dynamo-minio
+```
+
+### Clean Up Data
+
+```bash
+# Remove MinIO data
+rm -rf ~/dynamo_minio_data
+
+# Remove LoRA cache
+rm -rf /tmp/dynamo_loras_minio
+```
+
+## API Reference
+
+### Load LoRA
+- **Endpoint**: `POST /v1/loras`
+- **Body**: `{"lora_name": "string", "source": {"uri": "string"}}`
+- **Response**: `{"status": "success", "message": "string", "lora_name": "string", "lora_id": int}`
+
+### List LoRAs
+- **Endpoint**: `GET /v1/loras`
+- **Response**: Array of loaded LoRAs
+
+### Unload LoRA
+- **Endpoint**: `DELETE /v1/loras/{lora_name}`
+- **Response**: `{"status": "success", "message": "string"}`
+
+### List Models
+- **Endpoint**: `GET /v1/models`
+- **Response**: OpenAI-compatible models list
+
+### Chat Completions
+- **Endpoint**: `POST /v1/chat/completions`
+- **Body**: OpenAI-compatible chat completion request
+- **Response**: OpenAI-compatible chat completion response
--- a/examples/common/setup_minio.sh
+++ b/examples/common/setup_minio.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Shared script to set up MinIO and upload LoRA adapters from Hugging Face Hub.
+# Backend-agnostic: symlink from any backend's lora/ directory.
+# SCRIPT_DIR resolves to the directory of the symlink, not this file's location,
+# so "Next steps" messages correctly reference the backend's launch script.
+
+# Abort on the first failing command.
+set -e
+
+# Directory this script was invoked from (logical path, symlinks not resolved);
+# used when printing the "Next steps" launch command.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Colors (ANSI escape sequences, rendered by `echo -e`)
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m'
+
+# Configuration
+# Host directory bind-mounted as /data in the MinIO container.
+MINIO_DATA_DIR="${HOME}/dynamo_minio_data"
+# S3 API endpoint passed to the AWS CLI and polled for health.
+MINIO_ENDPOINT="http://localhost:9000"
+# Exported as AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY by configure_aws_cli.
+MINIO_ACCESS_KEY="minioadmin"
+MINIO_SECRET_KEY="minioadmin"
+# Bucket that receives the uploaded LoRA files.
+BUCKET_NAME="my-loras"
+
+# Default LoRA (override via env vars)
+HF_LORA_REPO="${HF_LORA_REPO:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
+LORA_NAME="${LORA_NAME:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
+# Set by download_lora_from_hf to the temporary download directory.
+TEMP_DIR=""
+
+# HF CLI: "hf" (v0.34.0+) or "huggingface-cli" (legacy)
+# Filled in by check_dependencies.
+HF_CLI_CMD=""
+
+# Parse args: only the first positional argument is inspected.
+MODE="full"
+case "${1:-}" in
+    --start)   MODE="start" ;;
+    --stop)    MODE="stop" ;;
+    -h|--help) MODE="help" ;;
+    "")        MODE="full" ;;
+    *)
+        # Unknown option: report on stderr and exit non-zero so callers and CI
+        # can detect the mistake instead of seeing a successful help run.
+        echo -e "${RED}Error: Unknown option '$1'${NC}" >&2
+        echo "Run '$0 --help' for usage." >&2
+        exit 1
+        ;;
+esac
+
+info()    { echo -e "${YELLOW}-> $1${NC}"; }
+success() { echo -e "${GREEN}ok $1${NC}"; }
+
+# Print usage information to stdout.
+# The heredoc delimiter is unquoted, so $0, $HF_LORA_REPO and $LORA_NAME are
+# expanded and the help text shows the values currently in effect.
+show_help() {
+    cat <<EOF
+Usage: $0 [OPTIONS]
+
+Setup MinIO and upload LoRA adapters from Hugging Face Hub.
+
+Options:
+  (none)      Full setup: start MinIO, download and upload LoRA
+  --start     Start MinIO container only
+  --stop      Stop and remove MinIO container
+  -h, --help  Show this help
+
+Environment Variables:
+  HF_LORA_REPO  HF repository (default: $HF_LORA_REPO)
+  LORA_NAME     Name for the LoRA (default: $LORA_NAME)
+
+Examples:
+  $0                                # Full setup
+  $0 --start                        # Start MinIO only
+  $0 --stop                         # Stop MinIO
+  HF_LORA_REPO=user/repo $0        # Custom LoRA
+EOF
+}
+
+# Verify that the external tools needed by the full setup are installed.
+# Sets HF_CLI_CMD to "hf" when available, otherwise to "huggingface-cli".
+# Prints an error on stderr and exits with status 1 when a tool is missing.
+check_dependencies() {
+    info "Checking dependencies..."
+    command -v docker &>/dev/null || { echo "Error: docker not installed" >&2; exit 1; }
+    # curl is used by start_minio to poll the health endpoint; without this
+    # check a missing curl surfaces as a misleading "did not start in time".
+    command -v curl &>/dev/null   || { echo "Error: curl not installed" >&2; exit 1; }
+    command -v aws &>/dev/null    || { echo "Error: aws-cli not installed (pip install awscli)" >&2; exit 1; }
+
+    # Prefer the newer "hf" entry point and fall back to the legacy CLI name.
+    if command -v hf &>/dev/null; then
+        HF_CLI_CMD="hf"
+    elif command -v huggingface-cli &>/dev/null; then
+        HF_CLI_CMD="huggingface-cli"
+    else
+        echo "Error: Neither 'hf' nor 'huggingface-cli' installed (pip install huggingface-hub[cli])" >&2
+        exit 1
+    fi
+    success "Dependencies OK (HF CLI: ${HF_CLI_CMD})"
+}
+
+# Start a fresh "dynamo-minio" container and wait until it reports healthy.
+# Any existing container with that name is stopped and removed first; the data
+# directory on the host is kept and re-mounted.
+# Exits with status 1 if the health endpoint does not succeed within 30 tries.
+start_minio() {
+    info "Setting up MinIO..."
+    mkdir -p "${MINIO_DATA_DIR}"
+    # Best-effort removal of a previous container; errors are ignored.
+    docker stop dynamo-minio 2>/dev/null || true
+    docker rm dynamo-minio 2>/dev/null || true
+
+    docker run -d --name dynamo-minio \
+        -p 9000:9000 -p 9001:9001 \
+        -v "${MINIO_DATA_DIR}:/data" \
+        quay.io/minio/minio server /data --console-address ":9001"
+
+    info "Waiting for MinIO..."
+    for i in {1..30}; do
+        # -f makes curl fail on HTTP error statuses, so an error response from
+        # a half-started server is not mistaken for "ready".
+        curl -sf "${MINIO_ENDPOINT}/minio/health/live" >/dev/null 2>&1 && break
+        [ "$i" -eq 30 ] && { echo "Error: MinIO did not start in time"; exit 1; }
+        sleep 1
+    done
+    success "MinIO ready (API: ${MINIO_ENDPOINT}, Console: http://localhost:9001)"
+}
+
+configure_aws_cli() {
+    export AWS_ACCESS_KEY_ID="${MINIO_ACCESS_KEY}"
+    export AWS_SECRET_ACCESS_KEY="${MINIO_SECRET_KEY}"
+    export AWS_ENDPOINT_URL="${MINIO_ENDPOINT}"
+
+    if ! aws --endpoint-url=${MINIO_ENDPOINT} s3 ls s3://${BUCKET_NAME} 2>/dev/null; then
+        aws --endpoint-url=${MINIO_ENDPOINT} s3 mb s3://${BUCKET_NAME}
+        success "Bucket created: ${BUCKET_NAME}"
+    else
+        success "Bucket exists: ${BUCKET_NAME}"
+    fi
+}
+
+download_lora_from_hf() {
+    info "Downloading LoRA: ${HF_LORA_REPO}..."
+    TEMP_DIR=$(mktemp -d -t lora_download_XXXXXX)
+
+    if [ "${HF_CLI_CMD}" = "huggingface-cli" ]; then
+        huggingface-cli download "${HF_LORA_REPO}" \
+            --local-dir "${TEMP_DIR}" --local-dir-use-symlinks False
+    else
+        hf download "${HF_LORA_REPO}" --local-dir "${TEMP_DIR}"
+    fi
+
+    rm -rf "${TEMP_DIR}/.cache"
+    success "Downloaded to ${TEMP_DIR}"
+}
+
+# Sync the downloaded LoRA files from TEMP_DIR to the bucket under LORA_NAME,
+# skipping anything matching "*.git*".
+upload_lora_to_minio() {
+    local destination="s3://${BUCKET_NAME}/${LORA_NAME}"
+
+    info "Uploading to ${destination}..."
+    aws --endpoint-url="${MINIO_ENDPOINT}" s3 sync "${TEMP_DIR}" "${destination}" \
+        --exclude "*.git*"
+    success "Upload complete"
+}
+
+cleanup() {
+    [ -n "${TEMP_DIR}" ] && [ -d "${TEMP_DIR}" ] && rm -rf "${TEMP_DIR}"
+}
+
+stop_minio() {
+    info "Stopping MinIO..."
+    docker stop dynamo-minio 2>/dev/null && success "Stopped" || info "Not running"
+    docker rm dynamo-minio 2>/dev/null && success "Removed" || true
+    echo "Data preserved in: ${MINIO_DATA_DIR}"
+}
+
+# --- Main ---
+# Dispatch on the mode selected during argument parsing.
+case "$MODE" in
+    help)
+        show_help; exit 0 ;;
+    stop)
+        stop_minio ;;
+    start)
+        start_minio ;;
+    full)
+        # With `set -e` a failed download or upload exits immediately, which
+        # would skip the explicit cleanup below and leak the temporary
+        # directory. The EXIT trap covers that path; `|| true` keeps the trap
+        # from affecting the script's exit status.
+        trap 'cleanup || true' EXIT
+        check_dependencies
+        start_minio
+        configure_aws_cli
+        download_lora_from_hf
+        upload_lora_to_minio
+        cleanup
+        echo ""
+        echo "Setup complete. Next steps:"
+        echo "  1. Launch:  ${SCRIPT_DIR}/agg_lora.sh"
+        echo "  2. Load:    curl -X POST http://localhost:8081/v1/loras \\"
+        echo "                -H 'Content-Type: application/json' \\"
+        echo "                -d '{\"lora_name\": \"${LORA_NAME}\", \"source\": {\"uri\": \"s3://${BUCKET_NAME}/${LORA_NAME}\"}}'"
+        echo "  3. Infer:   curl http://localhost:8000/v1/chat/completions \\"
+        echo "                -H 'Content-Type: application/json' \\"
+        echo "                -d '{\"model\": \"${LORA_NAME}\", \"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}]}'"
+        echo "  4. Stop:    $0 --stop"
+        ;;
+esac
--- a/tests/serve/lora_utils.py
+++ b/tests/serve/lora_utils.py
@@ -22,6 +22,7 @@ import boto3
 import requests
 from botocore.client import Config
 from botocore.exceptions import ClientError
+from huggingface_hub import snapshot_download

 if TYPE_CHECKING:
    from mypy_boto3_s3.client import S3Client
@@ -237,29 +238,17 @@ class MinioService:
            f"Downloading LoRA {self.config.lora_repo} to {self._temp_download_dir}"
        )

-        # Run with HF_HUB_OFFLINE unset so the download works even when
+        # Temporarily unset HF_HUB_OFFLINE so the download works even when
        # the predownload_models fixture has already enabled offline mode.
-        # This only affects the subprocess env; the parent process is unchanged.
-        env = os.environ.copy()
-        env.pop("HF_HUB_OFFLINE", None)
-
-        result = subprocess.run(
-            [
-                "huggingface-cli",
-                "download",
+        old_offline = os.environ.pop("HF_HUB_OFFLINE", None)
+        try:
+            snapshot_download(
                self.config.lora_repo,
-                "--local-dir",
-                self._temp_download_dir,
-                "--local-dir-use-symlinks",
-                "False",
-            ],
-            capture_output=True,
-            text=True,
-            env=env,
-        )
-
-        if result.returncode != 0:
-            raise RuntimeError(f"Failed to download LoRA: {result.stderr}")
+                local_dir=self._temp_download_dir,
+            )
+        finally:
+            if old_offline is not None:
+                os.environ["HF_HUB_OFFLINE"] = old_offline

        # Clean up cache directory
        cache_dir = os.path.join(self._temp_download_dir, ".cache")

--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -5,6 +5,7 @@ import dataclasses
 import logging
 import os
 from dataclasses import dataclass, field
+from typing import Optional

 import pytest

@@ -14,6 +15,7 @@ from tests.serve.common import (
    params_with_model_mark,
    run_serve_deployment,
 )
+from tests.serve.lora_utils import MinioLoraConfig
 from tests.utils.constants import DefaultPort
 from tests.utils.engine_process import EngineConfig
 from tests.utils.payload_builder import (
@@ -28,6 +30,7 @@ from tests.utils.payload_builder import (
    responses_payload_default,
    responses_stream_payload_default,
 )
+from tests.utils.payloads import LoraTestChatPayload

 logger = logging.getLogger(__name__)

@@ -498,3 +501,97 @@ def test_sglang_disagg_dp_attention(
    """Test sglang disaggregated with DP attention (requires 4 GPUs)"""

    # Kept for reference; this test uses a different launch path and is skipped
+
+
+# ── LoRA Tests ──────────────────────────────────────────────────────────────
+
+# Path of the "launch/lora" directory under sglang_dir.
+# NOTE(review): not referenced by the code visible in this hunk - confirm it is used.
+lora_dir = os.path.join(sglang_dir, "launch/lora")
+
+
+def lora_chat_payload(
+    lora_name: str,
+    s3_uri: str,
+    system_port: int = DefaultPort.SYSTEM1.value,
+    repeat_count: int = 2,
+    expected_response: Optional[list] = None,
+    expected_log: Optional[list] = None,
+    max_tokens: int = 100,
+    temperature: float = 0.0,
+) -> LoraTestChatPayload:
+    """Create a LoRA-enabled chat payload for testing"""
+    return LoraTestChatPayload(
+        body={
+            "model": lora_name,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is deep learning? Answer in one sentence.",
+                }
+            ],
+            "max_tokens": max_tokens,
+            "temperature": temperature,
+            "stream": False,
+        },
+        lora_name=lora_name,
+        s3_uri=s3_uri,
+        system_port=system_port,
+        repeat_count=repeat_count,
+        expected_response=expected_response
+        or ["learning", "neural", "network", "AI", "model"],
+        expected_log=expected_log or [],
+    )
+
+
+@pytest.mark.sglang
+@pytest.mark.e2e
+@pytest.mark.gpu_1
+@pytest.mark.model("Qwen/Qwen3-0.6B")
+@pytest.mark.profiled_vram_gib(4.7)
+@pytest.mark.requested_sglang_kv_tokens(2848)
+@pytest.mark.timeout(158)
+@pytest.mark.pre_merge
+def test_sglang_lora_aggregated(
+    request,
+    runtime_services_dynamic_ports,
+    predownload_models,
+    minio_lora_service,
+    dynamo_dynamic_ports,
+):
+    """
+    Test LoRA inference with aggregated SGLang deployment.
+
+    This test:
+    1. Uses MinIO fixture to provide S3-compatible storage with uploaded LoRA
+    2. Starts SGLang with LoRA support enabled
+    3. Loads the LoRA adapter via system API
+    4. Runs inference with the LoRA model
+    """
+    minio_config: MinioLoraConfig = minio_lora_service
+
+    lora_payload = lora_chat_payload(
+        lora_name=minio_config.lora_name,
+        s3_uri=minio_config.get_s3_uri(),
+        system_port=DefaultPort.SYSTEM1.value,
+        repeat_count=2,
+    )
+
+    config = SGLangConfig(
+        name="test_sglang_lora_aggregated",
+        directory=sglang_dir,
+        script_name="lora/agg_lora.sh",
+        marks=[],
+        model="Qwen/Qwen3-0.6B",
+        timeout=158,
+        env=minio_config.get_env_vars(),
+        request_payloads=[lora_payload],
+    )
+
+    config = dataclasses.replace(
+        config, frontend_port=dynamo_dynamic_ports.frontend_port
+    )
+    run_serve_deployment(
+        config,
+        request,
+        ports=dynamo_dynamic_ports,
+        extra_env=minio_config.get_env_vars(),
+    )