fix: replace torch.load with safetensors and enable Rust frontend media...

fix: replace torch.load with safetensors and enable Rust frontend media decoding for TRT-LLM multimodal (#8295) Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>

fix: replace torch.load with safetensors and enable Rust frontend media...
fix: replace torch.load with safetensors and enable Rust frontend media decoding for TRT-LLM multimodal (#8295) Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
f923777e · Indrajit Bhosale · GitHub · fd361c82 · f923777e · f923777e
Unverified Commit f923777e authored Apr 21, 2026 by Indrajit Bhosale Committed by GitHub Apr 21, 2026
9 changed files
--- a/components/src/dynamo/common/multimodal/image_loader.py
+++ b/components/src/dynamo/common/multimodal/image_loader.py
@@ -137,6 +137,17 @@ class ImageLoader:
        finally:
            self._inflight.pop(key, None)

+    async def _read_and_convert_nixl_image(
+        self, metadata: Dict[str, Any]
+    ) -> Image.Image:
+        """Read decoded image via NIXL and convert numpy array to PIL Image."""
+        assert self._nixl_connector is not None
+        arr = await read_decoded_media_via_nixl(self._nixl_connector, metadata)
+        # TRT-LLM's input processor requires PIL Images (accesses .height/.width
+        # for token count calculation). fromarray() is near-zero-cost: it wraps
+        # the existing numpy buffer without copying pixel data.
+        return Image.fromarray(arr)
+
    @_nvtx.annotate("mm:img:load_image", color="lime")
    async def load_image(self, image_url: str) -> Image.Image:
        parsed_url = urlparse(image_url)
@@ -222,9 +233,7 @@ class ImageLoader:
                    metadata = item[DECODED_VARIANT_KEY]
                    if self._nixl_connector is None:
                        raise RuntimeError("NIXL connector is not initialized")
-                    image_futures.append(
-                        read_decoded_media_via_nixl(self._nixl_connector, metadata)
-                    )
+                    image_futures.append(self._read_and_convert_nixl_image(metadata))
                else:
                    logger.error(
                        "Received Decoded multimodal data but enable_frontend_decoding=False. "

--- a/components/src/dynamo/trtllm/backend_args.py
+++ b/components/src/dynamo/trtllm/backend_args.py
@@ -215,6 +215,17 @@ class DynamoTrtllmArgGroup(ArgGroup):
            arg_type=int,
            help="Maximum size of downloadable embedding files/Image URLs.",
        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--frontend-decoding",
+            env_var="DYN_TRTLLM_FRONTEND_DECODING",
+            default=False,
+            help=(
+                "Enable frontend decoding of multimodal images. "
+                "When enabled, images are decoded in the Rust frontend and transferred to the backend via NIXL RDMA. "
+                "Without this flag, images are decoded in the Python backend (default behavior)."
+            ),
+        )

        # --- Guided Decoding ---
        add_argument(
@@ -479,6 +490,7 @@ class DynamoTrtllmConfig(ConfigBase):
    encode_endpoint: str
    allowed_local_media_path: str
    max_file_size_mb: int
+    frontend_decoding: bool

    default_height: int
    default_width: int

--- a/components/src/dynamo/trtllm/encode_helper.py
+++ b/components/src/dynamo/trtllm/encode_helper.py
@@ -209,8 +209,8 @@ class EncodeHelper:
    # Two supported flows:
    #
    # 1. EMBEDDING-PATH FLOW (Pre-computed embeddings via NIXL)
-    #    - User sends URL ending in .pt/.pth/.bin
-    #    - Encode worker loads tensor, creates NIXL readable op
+    #    - User sends URL ending in .safetensors
+    #    - Encode worker loads tensor (via safetensors), creates NIXL readable op
    #    - Prefill worker reads embeddings via RDMA
    #    - Use case: Customer has pre-computed embeddings from custom encoder
    #
@@ -235,7 +235,7 @@ class EncodeHelper:
        for the prefill worker to read via RDMA.

        Args:
-            embedding_paths: List of paths to embedding files (.pt/.pth/.bin)
+            embedding_paths: List of paths to embedding files (.safetensors)
            multimodal_processor: Processor to load embeddings
            connector: NIXL connector for RDMA transfer

@@ -460,5 +460,5 @@ class EncodeHelper:
        # No valid multimodal content found
        else:
            yield {
-                "error": "No embedding_paths or image_urls found in request, or image_urls without text_prompt or token_ids"
+                "error": "No embedding_paths (.safetensors) or image_urls found in request, or image_urls without text_prompt or token_ids"
            }
--- a/components/src/dynamo/trtllm/multimodal_processor.py
+++ b/components/src/dynamo/trtllm/multimodal_processor.py
@@ -15,13 +15,14 @@

 import logging
 import time
-from io import BytesIO
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Protocol, Tuple
 from urllib.parse import urlparse
-from urllib.request import urlopen

+import httpx
 import torch
+from safetensors.torch import load as safetensors_load
+from safetensors.torch import load_file as safetensors_load_file
 from tensorrt_llm.llmapi.tokenizer import tokenizer_factory

 from dynamo.common.multimodal.image_loader import ImageLoader
@@ -57,6 +58,7 @@ class MultimodalRequestProcessor:
        max_file_size_mb: int,
        tokenizer: Optional[TokenizerProtocol] = None,
        allowed_local_media_path: str = "",
+        enable_frontend_decoding: bool = False,
    ):
        self.model_type = model_type
        self.model_dir = model_dir
@@ -73,7 +75,9 @@ class MultimodalRequestProcessor:
        else:
            self.tokenizer = tokenizer_factory(model_dir)

-        self.image_loader = ImageLoader()
+        self.image_loader = ImageLoader(
+            enable_frontend_decoding=enable_frontend_decoding
+        )

    def is_url(self, path: str) -> bool:
        """Check if a path is a URL."""
@@ -83,45 +87,86 @@ class MultimodalRequestProcessor:
            return False
        return bool(parsed.scheme and parsed.netloc)

-    def load_tensor_from_path_or_url(self, path: str) -> torch.Tensor:
-        """Load a tensor from either a local file path or a URL."""
+    def _unwrap_safetensors(
+        self, data: Dict[str, torch.Tensor]
+    ) -> "torch.Tensor | Dict[str, torch.Tensor]":
+        """Return a single tensor when the file has one key, else the full dict.
+
+        Multi-key files (e.g. Maverick/Scout with mm_embeddings +
+        image_special_tokens + image_special_token_offsets) need the
+        full dict so encode_helper can extract auxiliary data.
+        """
+        if len(data) == 1:
+            return next(iter(data.values()))
+        return data
+
+    def load_tensor_from_path_or_url(
+        self, path: str
+    ) -> "torch.Tensor | Dict[str, torch.Tensor]":
+        """Load tensors from a local .safetensors path or URL.
+
+        Returns a single tensor for single-key files (e.g. LLaVA-NeXT),
+        or a dict of tensors for multi-key files (e.g. Maverick/Scout).
+        Only .safetensors format is accepted.
+        """
+        parsed = urlparse(path)
+        lower_path = parsed.path.lower()
+        if lower_path.endswith((".pt", ".pth", ".bin")):
+            raise RuntimeError(
+                "Unsafe tensor format: .pt/.pth/.bin files are not allowed. "
+                "Use .safetensors format instead."
+            )
+        if not lower_path.endswith(".safetensors"):
+            raise RuntimeError("Only .safetensors embedding files are supported.")
+
        if self.is_url(path):
-            # Download directly to memory using BytesIO (no filesystem ops)
+            if parsed.scheme not in ("http", "https"):
+                raise RuntimeError(f"Unsupported URL scheme: {parsed.scheme}")
            try:
-                with urlopen(path) as response:
-                    # Read at most max_size + 1 bytes to detect if file exceeds limit
-                    data = response.read(self.max_file_size_bytes + 1)
-                    if len(data) > self.max_file_size_bytes:
-                        raise RuntimeError(
-                            f"File size exceeds limit: {len(data) // (1024*1024)}MB > "
-                            f"{self.max_file_size_mb}MB "
-                        )
-                    tensor_stream = BytesIO(data)
-                    tensor = torch.load(
-                        tensor_stream, map_location="cpu", weights_only=True
-                    )
-                    return tensor
+                with httpx.Client(timeout=300.0) as client:
+                    with client.stream("GET", path) as resp:
+                        resp.raise_for_status()
+                        content_length = resp.headers.get("content-length")
+                        if (
+                            content_length
+                            and int(content_length) > self.max_file_size_bytes
+                        ):
+                            raise RuntimeError(
+                                f"File size exceeds limit: "
+                                f"{int(content_length) // (1024*1024)}MB > "
+                                f"{self.max_file_size_mb}MB"
+                            )
+                        chunks = []
+                        downloaded = 0
+                        for chunk in resp.iter_bytes():
+                            downloaded += len(chunk)
+                            if downloaded > self.max_file_size_bytes:
+                                raise RuntimeError(
+                                    f"File size exceeds limit: "
+                                    f"{downloaded // (1024*1024)}MB > "
+                                    f"{self.max_file_size_mb}MB"
+                                )
+                            chunks.append(chunk)
+                        content = b"".join(chunks)
+                    data = safetensors_load(content)
+                    return self._unwrap_safetensors(data)
+            except RuntimeError:
+                raise
            except Exception as e:
-                # Log actual error for debugging, return generic error to user
                logging.error(f"Failed to download or load tensor from URL: {e}")
                raise RuntimeError("Failed to load tensor")
        else:
-            # Restrict local file access to configured directory only
            try:
-                # Check if local media path is configured
                if not self.allowed_local_media_path:
                    logging.warning(
                        "Local file access attempted but no allowed path configured"
                    )
                    raise RuntimeError("Failed to load tensor")

-                # Strip file:// prefix if present
                local_path = path.removeprefix("file://")
-
                resolved_path = Path(local_path).resolve()
                allowed_path = Path(self.allowed_local_media_path).resolve()

-                # Secure path validation: Check if the resolved path is actually within allowed directory
                try:
                    resolved_path.relative_to(allowed_path)
                except ValueError:
@@ -130,17 +175,19 @@ class MultimodalRequestProcessor:
                    )
                    raise RuntimeError("Failed to load tensor")

-                # Check file size before loading
-                if resolved_path.exists():
-                    file_size = resolved_path.stat().st_size
-                    if file_size > self.max_file_size_bytes:
-                        raise RuntimeError(
-                            f"File size ({file_size // (1024*1024)}MB) exceeds "
-                            f"maximum allowed size ({self.max_file_size_bytes // (1024*1024)}MB)"
-                        )
-                return torch.load(resolved_path, map_location="cpu", weights_only=True)
+                if not resolved_path.exists():
+                    raise RuntimeError(f"Embedding file not found: {resolved_path}")
+                file_size = resolved_path.stat().st_size
+                if file_size > self.max_file_size_bytes:
+                    raise RuntimeError(
+                        f"File size ({file_size // (1024*1024)}MB) exceeds "
+                        f"maximum allowed size ({self.max_file_size_bytes // (1024*1024)}MB)"
+                    )
+                data = safetensors_load_file(str(resolved_path))
+                return self._unwrap_safetensors(data)
+            except RuntimeError:
+                raise
            except Exception as e:
-                # Log actual error for debugging, return generic error to user
                logging.error(f"Failed to load tensor from local path: {e}")
                raise RuntimeError("Failed to load tensor")

@@ -164,7 +211,7 @@ class MultimodalRequestProcessor:
                        if not url:
                            continue
                        self.modality = "image"
-                        if url.endswith((".pt", ".pth", ".bin")):
+                        if url.endswith(".safetensors"):
                            embedding_paths.append(url)
                        else:
                            image_urls.append(url)
@@ -247,7 +294,7 @@ class MultimodalRequestProcessor:
        multi_modal_data = request.get("multi_modal_data")
        if multi_modal_data and isinstance(multi_modal_data, dict):
            processed_mm_data = {}
-            loaded_embeddings = []
+            loaded_embeddings: list[torch.Tensor] = []

            # Process images and embedding paths from image_url field
            image_items = multi_modal_data.get("image_url", [])
@@ -274,8 +321,7 @@ class MultimodalRequestProcessor:
                        )
                        continue

-                    # Check if this is an embedding file based on extension
-                    if url.endswith((".pt", ".pth", ".bin")):
+                    if url.endswith(".safetensors"):
                        embedding_paths.append(url)
                    else:
                        # Keep original item format for load_image_batch
@@ -299,14 +345,25 @@ class MultimodalRequestProcessor:
                        logging.error(f"Failed to load images: {e}")
                        return None

-                # Load embedding files (.pt, .pth, .bin) for PD flow
-                # These are pre-computed vision encoder outputs
+                # Load pre-computed vision encoder embeddings (.safetensors) for PD flow
                if embedding_paths:
                    try:
-                        loaded_embeddings = [
+                        raw_loaded = [
                            self.load_tensor_from_path_or_url(path)
                            for path in embedding_paths
                        ]
+                        loaded_embeddings = []
+                        for item in raw_loaded:
+                            if isinstance(item, dict):
+                                emb = item.get("mm_embeddings")
+                                if emb is None:
+                                    logging.error(
+                                        "Dictionary embeddings missing 'mm_embeddings' key"
+                                    )
+                                    return None
+                                loaded_embeddings.append(emb)
+                            else:
+                                loaded_embeddings.append(item)
                        if loaded_embeddings:
                            logging.info(
                                f"Loaded {len(loaded_embeddings)} embedding file(s) from paths: {embedding_paths}"

--- a/components/src/dynamo/trtllm/workers/llm_worker.py
+++ b/components/src/dynamo/trtllm/workers/llm_worker.py
@@ -63,6 +63,18 @@ from dynamo.trtllm.request_handlers.handlers import (
 )
 from dynamo.trtllm.utils.trtllm_utils import deep_update

+# Optional imports for Rust frontend media decoding support
+MediaDecoder: type | None = None
+MediaFetcher: type | None = None
+try:
+    from dynamo.llm import MediaDecoder, MediaFetcher
+
+    MEDIA_DECODER_AVAILABLE = True
+except ImportError:
+    MediaDecoder = None
+    MediaFetcher = None
+    MEDIA_DECODER_AVAILABLE = False
+
 # Default buffer size for kv cache events.
 DEFAULT_KV_EVENT_BUFFER_MAX_SIZE = 1024

@@ -410,6 +422,7 @@ async def init_llm_worker(
            max_file_size_mb=config.max_file_size_mb,
            tokenizer=tokenizer,
            allowed_local_media_path=config.allowed_local_media_path,
+            enable_frontend_decoding=config.frontend_decoding,
        )

    else:
@@ -586,6 +599,21 @@ async def init_llm_worker(
            disagg_machine_id=int(endpoint.connection_id()) % 1021,
        )

+        media_decoder = None
+        media_fetcher = None
+        if config.frontend_decoding:
+            if not MEDIA_DECODER_AVAILABLE:
+                raise RuntimeError(
+                    "--frontend-decoding requires MediaDecoder support. "
+                    "Ensure dynamo.llm module includes MediaDecoder and MediaFetcher."
+                )
+            assert MediaDecoder is not None and MediaFetcher is not None
+            media_decoder = MediaDecoder()
+            media_decoder.enable_image({"limits": {"max_alloc": 128 * 1024 * 1024}})
+            media_fetcher = MediaFetcher()
+            media_fetcher.timeout_ms(30000)
+            media_fetcher.allow_direct_port(False)
+
        # Register the model with runtime config
        # Encode workers do NOT register - they're internal workers only
        # Prefill and decode workers register - frontend detects their role via ModelType
@@ -600,6 +628,8 @@ async def init_llm_worker(
                kv_cache_block_size=config.kv_block_size,
                runtime_config=runtime_config,
                custom_template_path=config.custom_jinja_template,
+                media_decoder=media_decoder,
+                media_fetcher=media_fetcher,
            )

        # Get health check payload (checks env var and falls back to TensorRT-LLM default)

--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -685,7 +685,7 @@ async def register_vllm_model(

        media_fetcher = MediaFetcher()
        media_fetcher.timeout_ms(30000)
-        media_fetcher.allow_direct_port(True)
+        media_fetcher.allow_direct_port(False)

    await register_model(
        model_input,

--- a/docs/features/multimodal/multimodal-trtllm.md
+++ b/docs/features/multimodal/multimodal-trtllm.md
@@ -17,7 +17,7 @@ You can provide multimodal inputs in the following ways:
 | Modality | Input Format | Aggregated | Disaggregated | Notes |
 |----------|--------------|------------|---------------|-------|
 | **Image** | HTTP/HTTPS URL | Yes | Yes | Full support for all image models |
-| **Image** | Pre-computed Embeddings (.pt, .pth, .bin) | Yes | Yes | Direct embedding files |
+| **Image** | Pre-computed Embeddings (.safetensors) | Yes | Yes | Direct embedding files |
 | **Video** | HTTP/HTTPS URL | No | No | Not implemented |
 | **Audio** | HTTP/HTTPS URL | No | No | Not implemented |

@@ -26,7 +26,7 @@ You can provide multimodal inputs in the following ways:
 | Format | Example | Description |
 |--------|---------|-------------|
 | **HTTP/HTTPS** | `http://example.com/image.jpg` | Remote media files |
-| **Pre-computed Embeddings** | `/path/to/embedding.pt` | Local embedding files (.pt, .pth, .bin) |
+| **Pre-computed Embeddings** | `/path/to/embedding.safetensors` | Local embedding files (.safetensors only) |

 ## Deployment Patterns

@@ -221,40 +221,24 @@ For high-performance multimodal inference, Dynamo supports pre-computed embeddin

 ### Supported File Types

- `.pt` - PyTorch tensor files
- `.pth` - PyTorch checkpoint files
- `.bin` - Binary tensor files
+- `.safetensors` - Safe tensor files ([safetensors format](https://huggingface.co/docs/safetensors))

-### Embedding File Formats
+> **Security Note:** `.pt`, `.pth`, and `.bin` files are **rejected** because they use Python pickle deserialization, which can execute arbitrary code. Only `.safetensors` format is accepted.

-TRT-LLM supports two formats for embedding files:
+### Embedding File Formats

-**1. Simple Tensor Format**
+Embedding files must use the `.safetensors` format. The first tensor key in the file is used as the embedding tensor.

-Direct tensor saved as `.pt` file containing only the embedding tensor:
+**Saving embeddings:**

 ```python
-embedding_tensor = torch.rand(1, 576, 4096)  # [batch, seq_len, hidden_dim]
-torch.save(embedding_tensor, "embedding.pt")
-```
-
-**2. Dictionary Format with Auxiliary Data**
+from safetensors.torch import save_file
+import torch

-Dictionary containing multiple keys, used by models like Llama-4 that require additional metadata:
-
-```python
-embedding_dict = {
-    "mm_embeddings": torch.rand(1, 576, 4096),
-    "special_tokens": [128256, 128257],
-    "image_token_offsets": [[0, 576]],
-    # ... other model-specific metadata
-}
-torch.save(embedding_dict, "llama4_embedding.pt")
+embedding_tensor = torch.rand(1, 576, 4096)  # [batch, seq_len, hidden_dim]
+save_file({"embedding": embedding_tensor}, "embedding.safetensors")
 ```

- **Simple tensors**: Loaded directly and passed to `mm_embeddings` parameter
- **Dictionary format**: `mm_embeddings` key extracted as main tensor, other keys preserved as auxiliary data
-
 ### How to Launch

 ```bash
@@ -264,7 +248,7 @@ cd $DYNAMO_HOME/examples/backends/trtllm
 ./launch/epd_disagg.sh
 ```

-> **Note:** This script is designed for 8-node H200 with `Llama-4-Scout-17B-16E-Instruct` model and assumes you have a model-specific embedding file ready.
+> **Note:** This script is designed for 8-node H200 with `Llama-4-Scout-17B-16E-Instruct` model and assumes you have a model-specific `.safetensors` embedding file ready.

 ### Configuration

@@ -289,7 +273,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe the image"},
-                {"type": "image_url", "image_url": {"url": "/path/to/embedding.pt"}}
+                {"type": "image_url", "image_url": {"url": "/path/to/embedding.safetensors"}}
            ]
        }
    ],
@@ -316,7 +300,7 @@ sequenceDiagram

    Client->>Frontend: POST /v1/chat/completions
    Frontend->>PrefillWorker: Route to prefill worker
-    PrefillWorker->>EncodeWorker: Send request (embedding paths)
+    PrefillWorker->>EncodeWorker: Send request (embedding .safetensors paths)
    EncodeWorker->>NIXL: Create readable operation
    EncodeWorker->>PrefillWorker: Send metadata + NIXL info
    PrefillWorker->>NIXL: Begin read operation
@@ -401,10 +385,10 @@ await register_model(

 | Transfer Stage | Message | NIXL Transfer |
 |----------------|---------|---------------|
-| **Frontend → Prefill** | Request with image URL or embedding path | No |
+| **Frontend → Prefill** | Request with image URL or .safetensors embedding path | No |
 | **Prefill → Encode (Image URL)** | Request with image URL | No |
 | **Encode → Prefill (Image URL)** | `ep_disaggregated_params` with `multimodal_embedding_handles`, processed prompt, and token IDs | No |
-| **Prefill → Encode (Embedding Path)** | Request with embedding file path | No |
+| **Prefill → Encode (Embedding Path)** | Request with .safetensors embedding file path | No |
 | **Encode → Prefill (Embedding Path)** | NIXL readable metadata + shape/dtype + auxiliary data | Yes (Embeddings tensor via RDMA) |
 | **Prefill → Decode** | `disaggregated_params` with `_epd_metadata` (prompt, token IDs) | Configurable (KV cache: NIXL default, UCX optional) |


--- a/tests/serve/launch/agg_raw_embeddings_llava.sh
+++ b/tests/serve/launch/agg_raw_embeddings_llava.sh
@@ -5,18 +5,18 @@
 # LLaVA Raw-Embeddings E/PD Test
 #
 # Phase 1 — Run HuggingFace vision encoder standalone to produce
-#            pre-computed embeddings at $EMBEDDINGS_FILE (.pt tensor).
+#            pre-computed embeddings at $EMBEDDINGS_FILE (.safetensors format).
 #
 # Phase 2 — Start Encode + Aggregated PD workers for LLaVA, then
 #            accept chat/completions requests whose image_url points
-#            to the embeddings file (file:///tmp/llava_embeddings.pt).
+#            to the embeddings file (file:///tmp/llava_embeddings.safetensors).
 #
 # Known limitation: The default revision of llava-hf/llava-v1.6-mistral-7b-hf
 # may crash with certain TRT-LLM versions.  Set MODEL_REVISION to pin a
 # safe commit (e.g. 52320fb52229).

 set -e
-trap 'echo Cleaning up...; rm -f "${EMBEDDINGS_FILE:-/tmp/llava_embeddings.pt}" /tmp/_resolved_model_path.txt; kill 0' EXIT
+trap 'echo Cleaning up...; rm -f "${EMBEDDINGS_FILE:-/tmp/llava_embeddings.safetensors}" /tmp/_resolved_model_path.txt; kill 0' EXIT

 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"

@@ -37,7 +37,7 @@ export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
 export CUSTOM_TEMPLATE=${CUSTOM_TEMPLATE:-"$DYNAMO_HOME/examples/backends/trtllm/templates/llava_multimodal.jinja"}

 # Embeddings configuration
-EMBEDDINGS_FILE="${EMBEDDINGS_FILE:-/tmp/llava_embeddings.pt}"
+EMBEDDINGS_FILE="${EMBEDDINGS_FILE:-/tmp/llava_embeddings.safetensors}"
 TEST_IMAGE_URL="${TEST_IMAGE_URL:-https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png}"

 # Extra arguments forwarded to the PD worker (e.g. --multimodal-embedding-cache-capacity-gb 10)
@@ -71,13 +71,14 @@ CUDA_VISIBLE_DEVICES=0 python3 - <<'PYEOF'
 import torch, io, os, urllib.request
 from PIL import Image
 from huggingface_hub import snapshot_download
+from safetensors.torch import save_file as safetensors_save_file
 from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor

 model_id   = os.environ["MODEL_PATH"]
 revision   = os.environ.get("MODEL_REVISION", "") or None
 image_url  = os.environ.get("TEST_IMAGE_URL",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
-output     = os.environ.get("EMBEDDINGS_FILE", "/tmp/llava_embeddings.pt")
+output     = os.environ.get("EMBEDDINGS_FILE", "/tmp/llava_embeddings.safetensors")

 # ── Download / resolve model ──
 print(f"Resolving model {model_id} (revision={revision}) …")
@@ -125,8 +126,8 @@ with torch.no_grad():

 print(f"Embeddings: shape={embeddings.shape}, dtype={embeddings.dtype}")

-# ── Save to disk ──
-torch.save(embeddings.cpu(), output)
+# ── Save to disk as safetensors (safe format, no pickle) ──
+safetensors_save_file({"embedding": embeddings.cpu()}, output)
 print(f"Saved embeddings → {output}")

 # ── Write resolved model path so Phase 2 uses the exact same revision ──

--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -341,12 +341,12 @@ trtllm_configs = {
    ),
    # LLaVA raw-embeddings E/PD test
    # Validates the raw-embeddings code path where pre-computed vision embeddings
-    # (.pt tensor file) are sent via file:// URL instead of a raw image URL.
+    # (.safetensors file) are sent via file:// URL instead of a raw image URL.
    #
    # Flow:
    #   1. Launch script generates embeddings using standalone HF vision encoder
    #   2. Encode + Aggregated PD workers start for LLaVA
-    #   3. Test sends chat/completions request with file:///tmp/llava_embeddings.pt
+    #   3. Test sends chat/completions request with file:///tmp/llava_embeddings.safetensors
    #
    # Uses gpu_2: encode worker on GPU 0, PD worker on GPU 1.
    # The 7B LLaVA model requires two GPUs because both encode and PD workers
@@ -372,7 +372,7 @@ trtllm_configs = {
        delayed_start=180,
        request_payloads=[
            multimodal_payload_default(
-                image_url="file:///tmp/llava_embeddings.pt",
+                image_url="file:///tmp/llava_embeddings.safetensors",
                text="Describe what this image shows.",
                expected_response=["bench", "person", "image", "picture"],
            )
@@ -440,6 +440,35 @@ trtllm_configs = {
            ),
        ],
    ),
+    # Aggregated multimodal with --frontend-decoding enabled.
+    # Verifies image URL inference works when images are decoded by the Rust
+    # MediaDecoder in the frontend instead of the Python backend.
+    "aggregated_multimodal_frontend_decoding": TRTLLMConfig(
+        name="aggregated_multimodal_frontend_decoding",
+        directory=trtllm_dir,
+        script_name="agg_multimodal.sh",
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.trtllm,
+            pytest.mark.multimodal,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(900),
+        ],
+        model="Qwen/Qwen3-VL-2B-Instruct",
+        frontend_port=DefaultPort.FRONTEND.value,
+        timeout=900,
+        delayed_start=60,
+        request_payloads=[
+            multimodal_payload_default(
+                text="Describe what you see in this image.",
+                expected_response=["mountain", "rock", "trees", "road"],
+            )
+        ],
+        env={
+            "AGG_ENGINE_ARGS": "/workspace/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/agg.yaml",
+            "DYN_TRTLLM_FRONTEND_DECODING": "true",
+        },
+    ),
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,