feat: profiling PVC updates for better UX (#2402)

bea92eae · hhzhang16 · GitHub · c91e2e49 · bea92eae · bea92eae
Unverified Commit bea92eae authored Aug 11, 2025 by hhzhang16 Committed by GitHub Aug 11, 2025
5 changed files
--- a/benchmarks/profiler/deploy/pvc-access-pod.yaml
+++ b/benchmarks/profiler/deploy/pvc-access-pod.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: Pod
+metadata:
+  name: pvc-access-pod
+  labels:
+    app: pvc-access
+spec:
+  activeDeadlineSeconds: 300  # Hard stop 5 minutes after start; the pod is terminated, not deleted
+  securityContext:
+    runAsNonRoot: true
+    runAsUser: 1000
+    fsGroup: 1000
+  containers:
+  - name: ubuntu
+    image: ubuntu:22.04
+    command: ["/bin/bash"]
+    args: ["-c", "sleep 290"]  # Sleep for slightly less than deadline - tools can be installed via kubectl exec if needed
+    securityContext:
+      allowPrivilegeEscalation: false
+      readOnlyRootFilesystem: false
+      capabilities:
+        drop:
+        - ALL
+    volumeMounts:
+    - name: profiling-storage
+      mountPath: /profiling_results
+    resources:
+      requests:
+        memory: "128Mi"
+        cpu: "100m"
+      limits:
+        memory: "256Mi"
+        cpu: "200m"
+  volumes:
+  - name: profiling-storage
+    persistentVolumeClaim:
+      claimName: profiling-pvc
+  restartPolicy: Never
--- a/benchmarks/profiler/download_pvc_results.py
+++ b/benchmarks/profiler/download_pvc_results.py
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+PVC Results Download Script
+
+This script downloads all relevant profiling results from the profiling PVC to a local directory.
+It creates the necessary access pod, downloads the files, and cleans up automatically.
+
+Usage:
+    python3 download_pvc_results.py --namespace <namespace> --output-dir <local_directory> [--no-config]
+
+Examples:
+    # Download to ./results directory
+    python3 download_pvc_results.py --namespace <namespace> --output-dir ./results
+
+    # Download to specific directory
+    python3 download_pvc_results.py --namespace <namespace> --output-dir /home/user/profiling_data
+
+    # Download without configuration files
+    python3 download_pvc_results.py --namespace <namespace> --output-dir ./results --no-config
+"""
+
+import argparse
+import subprocess
+import sys
+import time
+from pathlib import Path
+from typing import List
+
+from utils.kubernetes import check_kubectl_access, deploy_access_pod, run_command
+
+
+def list_pvc_contents(
+    namespace: str, pod_name: str, skip_config: bool = False
+) -> List[str]:
+    """List contents of the PVC to identify relevant files."""
+    print("Scanning PVC contents...")
+
+    # Build find command with optional config file exclusion
+    find_cmd = [
+        "kubectl",
+        "exec",
+        pod_name,
+        "-n",
+        namespace,
+        "--",
+        "find",
+        "/profiling_results",
+        "-type",
+        "f",
+        "-name",
+        "*.png",
+        "-o",
+        "-name",
+        "*.npz",
+    ]
+
+    # Add config file patterns if not skipping them
+    if not skip_config:
+        find_cmd.extend(
+            [
+                "-o",
+                "-name",
+                "*.yaml",
+                "-o",
+                "-name",
+                "*.yml",
+            ]
+        )
+
+    try:
+        result = run_command(find_cmd, capture_output=True)
+
+        files = [f.strip() for f in result.stdout.split("\n") if f.strip()]
+        config_note = " (excluding config files)" if skip_config else ""
+        print(f"Found {len(files)} relevant files to download{config_note}")
+        return files
+
+    except subprocess.CalledProcessError:
+        print("ERROR: Failed to list PVC contents")
+        sys.exit(1)
+
+
+def download_files(
+    namespace: str, pod_name: str, files: List[str], output_dir: Path
+) -> None:
+    """Download relevant files from PVC to local directory."""
+    if not files:
+        print("No files to download")
+        return
+
+    # Create output directory
+    output_dir.mkdir(parents=True, exist_ok=True)
+    print(f"Downloading {len(files)} files to {output_dir}")
+
+    downloaded = 0
+    failed = 0
+
+    for file_path in files:
+        try:
+            # Determine relative path and create local structure
+            rel_path = file_path.replace("/profiling_results/", "")
+
+            # Validate relative path
+            if ".." in rel_path or rel_path.startswith("/"):
+                print(f"  WARNING: Skipping potentially unsafe path: {file_path}")
+                failed += 1
+                continue
+
+            local_file = output_dir / rel_path
+
+            # Ensure the file is within output_dir
+            if not local_file.resolve().is_relative_to(output_dir.resolve()):
+                print(f"  WARNING: Skipping file outside output directory: {file_path}")
+                failed += 1
+                continue
+
+            local_file.parent.mkdir(parents=True, exist_ok=True)
+
+            # Download file
+            run_command(
+                [
+                    "kubectl",
+                    "cp",
+                    f"{namespace}/{pod_name}:{file_path}",
+                    str(local_file),
+                ],
+                capture_output=True,
+            )
+
+            downloaded += 1
+            if downloaded % 5 == 0:  # Progress update every 5 files
+                print(f"  Downloaded {downloaded}/{len(files)} files...")
+
+        except subprocess.CalledProcessError as e:
+            print(f"  WARNING: Failed to download {file_path}: {e}")
+            failed += 1
+
+    print(f"✓ Download completed: {downloaded} successful, {failed} failed")
+
+
+def download_summary_files(
+    namespace: str, pod_name: str, output_dir: Path, skip_config: bool = False
+) -> None:
+    """Download key summary files that might not match the pattern."""
+    summary_files = [
+        "/profiling_results/prefill_performance.png",
+        "/profiling_results/decode_performance.png",
+    ]
+
+    # Add config files if not skipping them
+    if not skip_config:
+        summary_files.append(
+            "/profiling_results/disagg.yaml"
+        )  # In case it was injected
+
+    print("Downloading summary files...")
+
+    for file_path in summary_files:
+        try:
+            # Check if file exists first using subprocess.run directly
+            result = subprocess.run(
+                [
+                    "kubectl",
+                    "exec",
+                    pod_name,
+                    "-n",
+                    namespace,
+                    "--",
+                    "test",
+                    "-f",
+                    file_path,
+                ],
+                capture_output=True,
+                text=True,
+                check=False,
+            )
+
+            if result.returncode != 0:
+                # File doesn't exist, skip silently
+                continue
+
+            # File exists, download it
+            rel_path = file_path.replace("/profiling_results/", "")
+
+            # Validate relative path
+            if ".." in rel_path or rel_path.startswith("/"):
+                print(
+                    f"  ⚠️  Skipped {file_path.split('/')[-1]}: potentially unsafe path"
+                )
+                continue
+
+            local_file = output_dir / rel_path
+
+            # Ensure the file is within output_dir
+            if not local_file.resolve().is_relative_to(output_dir.resolve()):
+                print(
+                    f"  ⚠️  Skipped {file_path.split('/')[-1]}: outside output directory"
+                )
+                continue
+
+            local_file.parent.mkdir(parents=True, exist_ok=True)
+
+            run_command(
+                [
+                    "kubectl",
+                    "cp",
+                    f"{namespace}/{pod_name}:{file_path}",
+                    str(local_file),
+                ],
+                capture_output=True,
+            )
+
+            print(f"  ✓ {rel_path}")
+
+        except Exception as e:
+            # File doesn't exist or failed to download, skip silently
+            print(f"  ⚠️  Skipped {file_path.split('/')[-1]}: {e}")
+            pass
+
+
+def cleanup_access_pod(namespace: str, pod_name: str) -> None:
+    """Clean up the access pod (let it auto-delete via activeDeadlineSeconds)."""
+    print(f"ℹ️  Access pod '{pod_name}' will auto-delete in 5 minutes")
+    print(f"   To delete immediately: kubectl delete pod {pod_name} -n {namespace}")
+
+
+def generate_readme(output_dir: Path, file_count: int) -> None:
+    """Generate a README file explaining the downloaded contents."""
+    readme_content = f"""# Profiling Results
+
+Downloaded {file_count} files from profiling PVC.
+
+## File Structure
+
+### Performance Plots
+- `prefill_performance.png` - Main prefill performance across TP sizes
+- `decode_performance.png` - Main decode performance across TP sizes
+
+### Interpolation Data
+- `selected_prefill_interpolation/raw_data.npz` - Prefill performance data
+- `selected_prefill_interpolation/*.png` - Prefill interpolation plots
+- `selected_decode_interpolation/raw_data.npz` - Decode performance data
+- `selected_decode_interpolation/*.png` - Decode interpolation plots
+
+### Configuration Files
+- `disagg.yaml` - DynamoGraphDeployment configuration used for profiling
+
+### Individual TP Results
+- `prefill_tp*/` - Individual tensor parallelism profiling results
+- `decode_tp*/` - Individual tensor parallelism profiling results
+
+## Loading Data
+
+To load the .npz data files in Python:
+
+```python
+import numpy as np
+
+# Load prefill data
+prefill_data = np.load('selected_prefill_interpolation/raw_data.npz')
+print("Prefill data keys:", list(prefill_data.keys()))
+
+# Load decode data
+decode_data = np.load('selected_decode_interpolation/raw_data.npz')
+print("Decode data keys:", list(decode_data.keys()))
+```
+
+Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}
+"""
+
+    readme_path = output_dir / "README.md"
+    with open(readme_path, "w") as f:
+        f.write(readme_content)
+
+    print("📝 Generated README.md with download summary")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Download profiling results from PVC to local directory",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+
+    parser.add_argument(
+        "--namespace",
+        "-n",
+        required=True,
+        help="Kubernetes namespace containing the profiling PVC",
+    )
+
+    parser.add_argument(
+        "--output-dir",
+        "-o",
+        type=Path,
+        required=True,
+        help="Local directory to download results to",
+    )
+
+    parser.add_argument(
+        "--no-config",
+        action="store_true",
+        help="Skip downloading configuration files (*.yaml, *.yml)",
+    )
+
+    args = parser.parse_args()
+
+    print("📥 PVC Results Download")
+    print("=" * 40)
+
+    # Validate inputs
+    check_kubectl_access(args.namespace)
+
+    # Deploy access pod
+    pod_name = deploy_access_pod(args.namespace)
+
+    # List and download files
+    files = list_pvc_contents(args.namespace, pod_name, args.no_config)
+    download_files(args.namespace, pod_name, files, args.output_dir)
+
+    # Download additional summary files
+    download_summary_files(args.namespace, pod_name, args.output_dir, args.no_config)
+
+    # Generate README
+    generate_readme(args.output_dir, len(files))
+
+    # Cleanup info
+    cleanup_access_pod(args.namespace, pod_name)
+
+    print("\n✅ Download completed!")
+    print(f"📁 Results available at: {args.output_dir.absolute()}")
+    print("📄 See README.md for file descriptions")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/profiler/inject_disagg_config.py
+++ b/benchmarks/profiler/inject_disagg_config.py
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Disagg Config Injection Script
+
+This script copies a DynamoGraphDeployment disagg configuration file into the profiling PVC
+so it can be used by the SLA profiler job. The profiler can then reference this config
+using the DGD_CONFIG_FILE environment variable.
+
+Usage:
+    python3 inject_disagg_config.py --namespace <namespace> [--disagg-config <path>] [--target-path <path>]
+
+Examples:
+    # Use default disagg.yaml from components/backends/vllm/deploy/
+    python3 inject_disagg_config.py --namespace <namespace>
+
+    # Use custom disagg config
+    python3 inject_disagg_config.py --namespace <namespace> --disagg-config ./my-custom-disagg.yaml
+
+    # Use custom target path in PVC
+    python3 inject_disagg_config.py --namespace <namespace> --target-path /profiling_results/custom-disagg.yaml
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+from utils.kubernetes import check_kubectl_access, deploy_access_pod, run_command
+
+
+def copy_disagg_config(
+    namespace: str, disagg_config_path: Path, target_path: str
+) -> None:
+    """Copy the disagg config file into the PVC via the access pod."""
+    pod_name = "pvc-access-pod"
+
+    if not disagg_config_path.exists():
+        print(f"ERROR: Disagg config file not found: {disagg_config_path}")
+        sys.exit(1)
+
+    print(f"Copying {disagg_config_path} to {target_path} in PVC...")
+
+    # Copy file to pod
+    run_command(
+        [
+            "kubectl",
+            "cp",
+            str(disagg_config_path),
+            f"{namespace}/{pod_name}:{target_path}",
+        ],
+        capture_output=False,
+    )
+
+    # Verify the file was copied
+    result = run_command(
+        ["kubectl", "exec", pod_name, "-n", namespace, "--", "ls", "-la", target_path],
+        capture_output=True,
+    )
+
+    print("✓ Disagg config successfully copied to PVC")
+    print(f"File details: {result.stdout.strip()}")
+
+
+def cleanup_access_pod(namespace: str, keep_pod: bool = True) -> None:
+    """Optionally clean up the access pod."""
+    if keep_pod:
+        print("ℹ️  Access pod 'pvc-access-pod' left running for future use")
+        print(
+            f"   To access PVC: kubectl exec -it pvc-access-pod -n {namespace} -- /bin/bash"
+        )
+        print(f"   To delete pod: kubectl delete pod pvc-access-pod -n {namespace}")
+    else:
+        print("Cleaning up access pod...")
+        run_command(
+            [
+                "kubectl",
+                "delete",
+                "pod",
+                "pvc-access-pod",
+                "-n",
+                namespace,
+                "--ignore-not-found",
+            ],
+            capture_output=False,
+        )
+        print("✓ Access pod deleted")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Inject disagg config into profiling PVC",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+
+    parser.add_argument(
+        "--namespace",
+        "-n",
+        required=True,
+        help="Kubernetes namespace containing the profiling PVC",
+    )
+
+    parser.add_argument(
+        "--disagg-config",
+        type=Path,
+        default=Path("components/backends/vllm/deploy/disagg.yaml"),
+        help="Path to disagg config file (default: components/backends/vllm/deploy/disagg.yaml)",
+    )
+
+    parser.add_argument(
+        "--target-path",
+        default="/profiling_results/disagg.yaml",
+        help="Target path in PVC (default: /profiling_results/disagg.yaml)",
+    )
+
+    parser.add_argument(
+        "--cleanup",
+        action="store_true",
+        help="Delete the access pod after copying (default: keep running)",
+    )
+
+    args = parser.parse_args()
+
+    # Validate target_path to prevent directory traversal
+    if not args.target_path.startswith("/profiling_results/"):
+        print("ERROR: Target path must be within /profiling_results/")
+        sys.exit(1)
+
+    if ".." in args.target_path:
+        print("ERROR: Target path cannot contain '..'")
+        sys.exit(1)
+
+    print("🚀 Disagg Config Injection")
+    print("=" * 40)
+
+    # Validate inputs
+    check_kubectl_access(args.namespace)
+
+    # Deploy access pod
+    deploy_access_pod(args.namespace)
+
+    # Copy disagg config
+    copy_disagg_config(args.namespace, args.disagg_config, args.target_path)
+
+    # Cleanup
+    cleanup_access_pod(args.namespace, keep_pod=not args.cleanup)
+
+    print("\n✅ Disagg config injection completed!")
+    print(f"📁 Config available at: {args.target_path}")
+    print(f"🔧 Set DGD_CONFIG_FILE={args.target_path} in your profiler job")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/profiler/utils/kubernetes.py
+++ b/benchmarks/profiler/utils/kubernetes.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import subprocess
+import sys
+import time
+from pathlib import Path
+from typing import List
+
+
+def run_command(
+    cmd: List[str], capture_output: bool = True
+) -> subprocess.CompletedProcess:
+    """Run a command and handle errors."""
+    try:
+        result = subprocess.run(
+            cmd, capture_output=capture_output, text=True, check=True
+        )
+        return result
+    except subprocess.CalledProcessError as e:
+        print(f"ERROR: Command failed: {' '.join(cmd)}")
+        print(f"Exit code: {e.returncode}")
+        if e.stdout:
+            print(f"STDOUT: {e.stdout}")
+        if e.stderr:
+            print(f"STDERR: {e.stderr}")
+        sys.exit(1)
+
+
+def check_kubectl_access(namespace: str) -> None:
+    """Check if kubectl can access the specified namespace."""
+    print(f"Checking kubectl access to namespace '{namespace}'...")
+    run_command(["kubectl", "get", "pods", "-n", namespace], capture_output=True)
+    print("✓ kubectl access confirmed")
+
+
+def deploy_access_pod(namespace: str) -> str:
+    """Deploy the PVC access pod and return pod name."""
+    pod_name = "pvc-access-pod"
+
+    # Check if pod already exists and is running
+    try:
+        result = subprocess.run(
+            [
+                "kubectl",
+                "get",
+                "pod",
+                pod_name,
+                "-n",
+                namespace,
+                "-o",
+                "jsonpath={.status.phase}",
+            ],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+
+        if result.returncode == 0 and result.stdout.strip() == "Running":
+            print(f"✓ Access pod '{pod_name}' already running")
+            return pod_name
+    except Exception:
+        # Pod doesn't exist or isn't running
+        pass
+
+    print(f"Deploying access pod '{pod_name}' in namespace '{namespace}'...")
+
+    # Get the directory where this script is located
+    script_dir = Path(__file__).parent
+    pod_yaml_path = script_dir / "deploy" / "pvc-access-pod.yaml"
+
+    if not pod_yaml_path.exists():
+        print(f"ERROR: Pod YAML not found at {pod_yaml_path}")
+        sys.exit(1)
+
+    # Deploy the pod
+    run_command(
+        ["kubectl", "apply", "-f", str(pod_yaml_path), "-n", namespace],
+        capture_output=False,
+    )
+
+    print("Waiting for pod to be ready...")
+
+    # Wait for pod to be ready (up to 60 seconds)
+    for i in range(60):
+        try:
+            result = subprocess.run(
+                [
+                    "kubectl",
+                    "get",
+                    "pod",
+                    pod_name,
+                    "-n",
+                    namespace,
+                    "-o",
+                    "jsonpath={.status.phase}",
+                ],
+                capture_output=True,
+                text=True,
+                check=False,
+            )
+
+            if result.returncode == 0 and result.stdout.strip() == "Running":
+                print("✓ Access pod is ready")
+                return pod_name
+
+        except Exception:
+            pass
+
+        time.sleep(1)
+        if i % 10 == 0:
+            print(f"  Still waiting... ({i+1}s)")
+
+    print("ERROR: Access pod failed to become ready within 60 seconds")
+    sys.exit(1)
--- a/docs/architecture/pre_deployment_profiling.md
+++ b/docs/architecture/pre_deployment_profiling.md
@@ -76,20 +76,56 @@ kubectl create secret docker-registry nvcr-imagepullsecret \
  -n $NAMESPACE
 ```

-**Step 1: Build your own vLLM image for profiling**
+**Step 1: Configure container image**

+You have two options for configuring your profiling setup:
+
+**Option A: Use pre-built image with custom config injection (recommended)**
+
+Use the default pre-built image and inject custom configurations via PVC:
+
+1. **Set the container image:**
+   ```bash
+   export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.0 # or any existing image tag
+   ```
+
+2. **Inject your custom disagg configuration:**
+   ```bash
+   # Use default disagg.yaml config
+   python3 benchmarks/profiler/inject_disagg_config.py --namespace $NAMESPACE
+
+   # Or use a custom disagg config file
+   python3 benchmarks/profiler/inject_disagg_config.py --namespace $NAMESPACE --disagg-config my-custom-disagg.yaml
+
+   # Or specify a custom target path in the PVC
+   python3 benchmarks/profiler/inject_disagg_config.py --namespace $NAMESPACE --target-path /profiling_results/my-disagg.yaml
+   ```
+
+3. **Set the config path for the profiling job:**
+   ```bash
+   export DGD_CONFIG_FILE=/profiling_results/disagg.yaml # or your custom path
+   ```
+
+This approach allows you to:
+- Customize DGD configurations without rebuilding container images
+- Test different model configurations easily
+- Version control your DGD configs alongside your code
+
+> **Important**: For profiling, disagg configs should be run with Grove disabled by adding the annotation `nvidia.com/enable-grove: "false"` to avoid alpha Grove status issues.
+
+> **Note**: The default location in the PVC is `/profiling_results/disagg.yaml`. If you don't inject a config, the profiler will fall back to the built-in config at `/workspace/components/backends/vllm/deploy/disagg.yaml`.
+
+**Option B: Build custom image (only if you need code changes)**
+
+Only needed if you require custom code modifications beyond configuration changes:
 ```bash
 # in the project's root folder
 ./container/build.sh --framework VLLM
 # Tag and push to your container registry
-export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own dynamoimage
-# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
-# Modify this yaml to profile different models
-export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
+export DOCKER_IMAGE=<your docker tag>
+export DGD_CONFIG_FILE=<disagg config path> # path to your disagg.yaml file within the DOCKER_IMAGE
 ```

-Replace the `image` within `profile_sla_job.yaml` with the tag of the image you pushed.
-
 **Step 2: Set SLA target**

 Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL.
@@ -149,20 +185,20 @@ After the profiling job completes successfully, the results are stored in the pe

 The profiling results are stored in a PVC named `profiling-pvc`. To access the results:

-1. **Create a temporary pod to access the PVC:**
+1. **Deploy the PVC access pod (if not already running):**
   ```bash
-   kubectl run temp-access --image=alpine:latest --restart=Never \
-     --overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["tail","-f","/dev/null"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
-     -n $NAMESPACE
+   kubectl apply -f benchmarks/profiler/deploy/pvc-access-pod.yaml -n $NAMESPACE
   ```

-2. **Inside the temporary pod, navigate to the results directory:**
+2. **Access the PVC through the pod:**
   ```bash
-   kubectl exec -it temp-access -n $NAMESPACE -- sh
-   cd /workspace/profiling_results
+   kubectl exec -it pvc-access-pod -n $NAMESPACE -- /bin/bash
+   cd /profiling_results
   ls -la
   ```

+> **Note**: The same `pvc-access-pod` is used for both injecting disagg configs and accessing results. If you used the `inject_disagg_config.py` script earlier, the pod may already be running. The pod stops on its own about 5 minutes after it starts (`activeDeadlineSeconds: 300`); if it has already stopped, delete it and apply the manifest again.
+
 #### File Structure

 The profiling results directory contains the following structure:
@@ -185,7 +221,33 @@ The profiling results directory contains the following structure:

 #### Downloading Results Locally

-To download the profiling results to your local machine:
+You can download the profiling results using the automated download script or manually:
+
+**Option 1: Automated Download (Recommended)**
+
+Use the provided download script to automatically fetch all relevant files:
+
+```bash
+# Download to ./results directory
+python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results
+
+# Download to specific directory
+python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir /path/to/my/results
+
+# Download without any of the auto-created config.yaml files used in profiling
+python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --no-config
+```
+
+The script will:
+- Deploy a temporary access pod (it stops on its own about 5 minutes after it starts)
+- Scan for relevant files (*.png, *.npz, *.yaml, *.yml)
+- Download all files maintaining directory structure
+- Generate a README.md with file descriptions
+- Print the command for deleting the access pod once you are done
+
+**Option 2: Manual Download**
+
+To download the profiling results manually:

 1. **Download performance plots and data files:**
   ```bash
@@ -193,21 +255,21 @@ To download the profiling results to your local machine:
   mkdir -p ./profiling_results

   # Copy main performance plots
-   kubectl cp temp-access:/workspace/profiling_results/prefill_performance.png ./profiling_results/ -n $NAMESPACE
-   kubectl cp temp-access:/workspace/profiling_results/decode_performance.png ./profiling_results/ -n $NAMESPACE
+   kubectl cp pvc-access-pod:/profiling_results/prefill_performance.png ./profiling_results/ -n $NAMESPACE
+   kubectl cp pvc-access-pod:/profiling_results/decode_performance.png ./profiling_results/ -n $NAMESPACE

   # Copy interpolation directories (includes additional plots and data)
-   kubectl cp temp-access:/workspace/profiling_results/selected_prefill_interpolation/ ./profiling_results/ -n $NAMESPACE -r
-   kubectl cp temp-access:/workspace/profiling_results/selected_decode_interpolation/ ./profiling_results/ -n $NAMESPACE -r
+   kubectl cp pvc-access-pod:/profiling_results/selected_prefill_interpolation/ ./profiling_results/ -n $NAMESPACE -r
+   kubectl cp pvc-access-pod:/profiling_results/selected_decode_interpolation/ ./profiling_results/ -n $NAMESPACE -r
   ```

 2. **Alternative: Tar and download entire results directory:**
   ```bash
-   # Inside the temporary pod, create a tar archive
-   tar -czf /workspace/profiling_results/profiling_results.tar.gz -C /workspace/profiling_results .
+   # Inside the access pod, create a tar archive
+   tar -czf /profiling_results/profiling_results.tar.gz -C /profiling_results .

   # Download the archive to your local machine
-   kubectl cp temp-access:/workspace/profiling_results/profiling_results.tar.gz ./profiling_results.tar.gz -n $NAMESPACE
+   kubectl cp pvc-access-pod:/profiling_results/profiling_results.tar.gz ./profiling_results.tar.gz -n $NAMESPACE

   # Extract locally
   tar -xzf profiling_results.tar.gz -C ./profiling_results/
@@ -244,15 +306,18 @@ print("Decode data keys:", list(decode_data.keys()))

 #### Cleaning Up

-Once you've downloaded your results, clean up the temporary pod:
+The access pod stops automatically about 5 minutes after it starts, but the stopped pod stays in the namespace until you delete it:
+
 ```bash
-# Exit the temporary pod (if still inside)
+# Exit the access pod (if still inside)
 exit

-# The pod should auto-delete due to --rm flag, but if needed:
-kubectl delete pod temp-access -n $NAMESPACE
+# Delete the access pod (it stops on its own, but is not removed automatically)
+kubectl delete pod pvc-access-pod -n $NAMESPACE
 ```

+> **Note**: The access pod has `activeDeadlineSeconds: 300`, so Kubernetes terminates it 5 minutes after it starts to prevent resource waste. Terminating the pod does not delete it.
+
 ### Troubleshooting

 #### Image Pull Authentication Errors