fix: [lora] refactor test and clean up examples (#4884)

3c2b72b0 · Biswa Panda · GitHub · 242a4d5b · 3c2b72b0 · 3c2b72b0
Unverified commit 3c2b72b0, authored Dec 11, 2025 by Biswa Panda; committed by GitHub on Dec 11, 2025
6 changed files
--- a/examples/backends/vllm/launch/lora/agg_lora.sh
+++ b/examples/backends/vllm/launch/lora/agg_lora.sh
@@ -4,14 +4,6 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT

-# Follow the README.md instructions to setup MinIO or upload the LoRA to s3/minio
-# Adjust these values to match your local MinIO or S3 setup
-
-
-# load math lora to minio
-# LORA_NAME=Neural-Hacker/Qwen3-Math-Reasoning-LoRA HF_LORA_REPO=Neural-Hacker/Qwen3-Math-Reasoning-LoRA ./setup_minio.sh
-
-
 export AWS_ENDPOINT=http://localhost:9000
 export AWS_ACCESS_KEY_ID=minioadmin
 export AWS_SECRET_ACCESS_KEY=minioadmin
@@ -21,8 +13,6 @@ export AWS_ALLOW_HTTP=true
 # Dynamo LoRA Configuration
 export DYN_LORA_ENABLED=true
 export DYN_LORA_PATH=/tmp/dynamo_loras_minio
-export DYN_LOG=debug
-# export DYN_LOG_LEVEL=debug

 mkdir -p $DYN_LORA_PATH

@@ -63,7 +53,7 @@ curl -X POST http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen3-0.6B",
-    "messages": [{"role": "user", "content": "Solve (x*x - x + 1 = 0) for x"}],
+    "messages": [{"role": "user", "content": "What is deep learning?"}],
    "max_tokens": 300,
    "temperature": 0.0
  }'

--- a/examples/backends/vllm/launch/lora/agg_lora_router.sh
+++ b/examples/backends/vllm/launch/lora/agg_lora_router.sh
@@ -4,12 +4,6 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT

-# Follow the README.md instructions to setup MinIO or upload the LoRA to s3/minio
-# Adjust these values to match your local MinIO or S3 setup
-
-# load math lora to minio
-# LORA_NAME=Neural-Hacker/Qwen3-Math-Reasoning-LoRA HF_LORA_REPO=Neural-Hacker/Qwen3-Math-Reasoning-LoRA ./setup_minio.sh
-
 export AWS_ENDPOINT=http://localhost:9000
 export AWS_ACCESS_KEY_ID=minioadmin
 export AWS_SECRET_ACCESS_KEY=minioadmin
@@ -19,8 +13,6 @@ export AWS_ALLOW_HTTP=true
 # Dynamo LoRA Configuration
 export DYN_LORA_ENABLED=true
 export DYN_LORA_PATH=/tmp/dynamo_loras_minio
-export DYN_LOG=debug
-# export DYN_LOG_LEVEL=debug

 mkdir -p $DYN_LORA_PATH

@@ -118,7 +110,7 @@ curl localhost:8000/v1/chat/completions \
    "total_tokens": 226,
    "prompt_tokens_details": {
      "audio_tokens": null,
-      "cached_tokens": 192
+      "cached_tokens": 192              # prompt tokens that were already cached by the previous request
    }
  },
  "nvext": {

--- a/tests/serve/conftest.py
+++ b/tests/serve/conftest.py
@@ -86,8 +86,8 @@ def minio_lora_service():
        local_path = service.download_lora()
        service.upload_lora(local_path)

-        # Clean up downloaded files (keep MinIO running)
-        service.cleanup_temp()
+        # Clean up downloaded files (keep MinIO data intact)
+        service.cleanup_download()

        yield config


--- a/tests/serve/lora_utils.py
+++ b/tests/serve/lora_utils.py
@@ -61,7 +61,7 @@ class MinioService:
    def __init__(self, config: MinioLoraConfig):
        self.config = config
        self._logger = logging.getLogger(self.__class__.__name__)
-        self._temp_dir: Optional[str] = None
+        self._temp_download_dir: Optional[str] = None

    def start(self) -> None:
        """Start MinIO container"""
@@ -183,9 +183,9 @@ class MinioService:

    def download_lora(self) -> str:
        """Download LoRA from Hugging Face Hub, returns temp directory path"""
-        self._temp_dir = tempfile.mkdtemp(prefix="lora_download_")
+        self._temp_download_dir = tempfile.mkdtemp(prefix="lora_download_")
        self._logger.info(
-            f"Downloading LoRA {self.config.lora_repo} to {self._temp_dir}"
+            f"Downloading LoRA {self.config.lora_repo} to {self._temp_download_dir}"
        )

        result = subprocess.run(
@@ -194,7 +194,7 @@ class MinioService:
                "download",
                self.config.lora_repo,
                "--local-dir",
-                self._temp_dir,
+                self._temp_download_dir,
                "--local-dir-use-symlinks",
                "False",
            ],
@@ -206,11 +206,11 @@ class MinioService:
            raise RuntimeError(f"Failed to download LoRA: {result.stderr}")

        # Clean up cache directory
-        cache_dir = os.path.join(self._temp_dir, ".cache")
+        cache_dir = os.path.join(self._temp_download_dir, ".cache")
        if os.path.exists(cache_dir):
            shutil.rmtree(cache_dir)

-        return self._temp_dir
+        return self._temp_download_dir

    def upload_lora(self, local_path: str) -> None:
        """Upload LoRA to MinIO"""
@@ -246,11 +246,15 @@ class MinioService:
        if result.returncode != 0:
            raise RuntimeError(f"Failed to upload LoRA: {result.stderr}")

+    def cleanup_download(self) -> None:
+        """Clean up temporary download directory only"""
+        if self._temp_download_dir and os.path.exists(self._temp_download_dir):
+            shutil.rmtree(self._temp_download_dir)
+            self._temp_download_dir = None
+
    def cleanup_temp(self) -> None:
-        """Clean up temporary directories"""
-        if self._temp_dir and os.path.exists(self._temp_dir):
-            shutil.rmtree(self._temp_dir)
-            self._temp_dir = None
+        """Clean up all temporary directories including MinIO data dir"""
+        self.cleanup_download()

        if self.config.data_dir and os.path.exists(self.config.data_dir):
            shutil.rmtree(self.config.data_dir, ignore_errors=True)

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -16,7 +16,7 @@ from tests.serve.common import (
    run_serve_deployment,
 )
 from tests.serve.conftest import MULTIMODAL_IMG_PATH, MULTIMODAL_IMG_URL
-from tests.serve.lora_utils import MinioLoraConfig, load_lora_adapter
+from tests.serve.lora_utils import MinioLoraConfig
 from tests.utils.engine_process import EngineConfig
 from tests.utils.payload_builder import (
    chat_payload,
@@ -26,7 +26,7 @@ from tests.utils.payload_builder import (
    completion_payload_with_logprobs,
    metric_payload_default,
 )
-from tests.utils.payloads import ChatPayload, ToolCallingChatPayload
+from tests.utils.payloads import LoraTestChatPayload, ToolCallingChatPayload

 logger = logging.getLogger(__name__)

@@ -614,93 +614,6 @@ def test_multimodal_b64(request, runtime_services, predownload_models):
 lora_dir = os.path.join(vllm_dir, "launch/lora")


-class LoraTestChatPayload(ChatPayload):
-    """
-    Chat payload that loads a LoRA adapter before sending inference requests.
-
-    This payload first loads the specified LoRA adapter via the system API,
-    then sends chat completion requests using the LoRA model.
-    """
-
-    def __init__(
-        self,
-        body: dict,
-        lora_name: str,
-        s3_uri: str,
-        system_port: int = 8081,
-        repeat_count: int = 1,
-        expected_response: Optional[list] = None,
-        expected_log: Optional[list] = None,
-        timeout: int = 60,
-    ):
-        super().__init__(
-            body=body,
-            repeat_count=repeat_count,
-            expected_response=expected_response or [],
-            expected_log=expected_log or [],
-            timeout=timeout,
-        )
-        self.system_port = system_port
-        self.lora_name = lora_name
-        self.s3_uri = s3_uri
-        self._lora_loaded = False
-
-    def _ensure_lora_loaded(self) -> None:
-        """Ensure the LoRA adapter is loaded before making inference requests"""
-        if not self._lora_loaded:
-            import time
-
-            import requests
-
-            load_lora_adapter(
-                system_port=self.system_port,
-                lora_name=self.lora_name,
-                s3_uri=self.s3_uri,
-                timeout=self.timeout,
-            )
-
-            # Wait for the LoRA model to appear in /v1/models
-            models_url = f"http://{self.host}:{self.port}/v1/models"
-            start_time = time.time()
-            max_wait = 60  # 1 minute timeout
-
-            logger.info(
-                f"Waiting for LoRA model '{self.lora_name}' to appear in /v1/models..."
-            )
-
-            while time.time() - start_time < max_wait:
-                try:
-                    response = requests.get(models_url, timeout=5)
-                    if response.status_code == 200:
-                        data = response.json()
-                        models = data.get("data", [])
-                        model_ids = [m.get("id", "") for m in models]
-
-                        if self.lora_name in model_ids:
-                            logger.info(
-                                f"LoRA model '{self.lora_name}' is now available"
-                            )
-                            self._lora_loaded = True
-                            return
-
-                        logger.debug(
-                            f"Available models: {model_ids}, waiting for '{self.lora_name}'..."
-                        )
-                except requests.RequestException as e:
-                    logger.debug(f"Error checking /v1/models: {e}")
-
-                time.sleep(1)
-
-            raise RuntimeError(
-                f"Timeout: LoRA model '{self.lora_name}' did not appear in /v1/models within {max_wait}s"
-            )
-
-    def url(self) -> str:
-        """Load LoRA before first request, then return URL"""
-        self._ensure_lora_loaded()
-        return super().url()
-
-
 def lora_chat_payload(
    lora_name: str,
    s3_uri: str,

--- a/tests/utils/payloads.py
+++ b/tests/utils/payloads.py
@@ -21,6 +21,8 @@ from copy import deepcopy
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional

+import requests
+
 from dynamo import prometheus_names  # type: ignore[attr-defined]

 logger = logging.getLogger(__name__)
@@ -240,6 +242,93 @@ class ToolCallingChatPayload(ChatPayload):
            logger.info(f"Expected tool '{self.expected_tool_name}' was called")


+@dataclass
+class LoraTestChatPayload(ChatPayload):
+    """
+    Chat payload that loads a LoRA adapter before sending inference requests.
+
+    This payload first loads the specified LoRA adapter via the system API,
+    then sends chat completion requests using the LoRA model.
+    """
+
+    def __init__(
+        self,
+        body: dict,
+        lora_name: str,
+        s3_uri: str,
+        system_port: int = 8081,
+        repeat_count: int = 1,
+        expected_response: Optional[list] = None,
+        expected_log: Optional[list] = None,
+        timeout: int = 60,
+    ):
+        super().__init__(
+            body=body,
+            repeat_count=repeat_count,
+            expected_response=expected_response or [],
+            expected_log=expected_log or [],
+            timeout=timeout,
+        )
+        self.system_port = system_port
+        self.lora_name = lora_name
+        self.s3_uri = s3_uri
+        self._lora_loaded = False
+
+    def _ensure_lora_loaded(self) -> None:
+        """Ensure the LoRA adapter is loaded before making inference requests"""
+        if not self._lora_loaded:
+            # load_lora_adapter is imported here rather than at module level to avoid a
+            # circular dependency between tests.utils.payloads and tests.serve.lora_utils.
+            from tests.serve.lora_utils import load_lora_adapter
+
+            load_lora_adapter(
+                system_port=self.system_port,
+                lora_name=self.lora_name,
+                s3_uri=self.s3_uri,
+                timeout=self.timeout,
+            )
+
+            # Wait for the LoRA model to appear in /v1/models
+            models_url = f"http://{self.host}:{self.port}/v1/models"
+            start_time = time.time()
+
+            logger.info(
+                f"Waiting for LoRA model '{self.lora_name}' to appear in /v1/models..."
+            )
+
+            while time.time() - start_time < self.timeout:
+                try:
+                    response = requests.get(models_url, timeout=5)
+                    if response.status_code == 200:
+                        data = response.json()
+                        models = data.get("data", [])
+                        model_ids = [m.get("id", "") for m in models]
+
+                        if self.lora_name in model_ids:
+                            logger.info(
+                                f"LoRA model '{self.lora_name}' is now available"
+                            )
+                            self._lora_loaded = True
+                            return
+
+                        logger.debug(
+                            f"Available models: {model_ids}, waiting for '{self.lora_name}'..."
+                        )
+                except requests.RequestException as e:
+                    logger.debug(f"Error checking /v1/models: {e}")
+
+                time.sleep(1)
+
+            raise RuntimeError(
+                f"Timeout: LoRA model '{self.lora_name}' did not appear in /v1/models within {self.timeout}s"
+            )
+
+    def url(self) -> str:
+        """Load LoRA before first request, then return URL"""
+        self._ensure_lora_loaded()
+        return super().url()
+
+
 @dataclass
 class CompletionPayload(BasePayload):
    """Payload for completions endpoint."""