[Feature] Add InstantTensor weight loader (#36139)

8c29042b · arlo · GitHub · 5467d137 · 8c29042b · 8c29042b
Unverified Commit 8c29042b authored Mar 15, 2026 by arlo Committed by GitHub Mar 14, 2026
13 changed files
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -31,7 +31,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y \
    && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
-    gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof xz-utils \
+    gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof make xz-utils \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
    && curl -LsSf https://astral.sh/uv/install.sh | sh

@@ -154,7 +154,7 @@ WORKDIR /vllm-workspace

 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    apt-get install -y --no-install-recommends vim numactl make clangd-14
+    apt-get install -y --no-install-recommends vim numactl clangd-14

 RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd


--- a/docs/models/extensions/instanttensor.md
+++ b/docs/models/extensions/instanttensor.md
+# Loading Model Weights with InstantTensor
+
+InstantTensor accelerates loading Safetensors weights on CUDA devices through distributed loading, pipelined prefetching, and direct I/O. InstantTensor also supports GDS (GPUDirect Storage) when available.
+For more details, see the [InstantTensor GitHub repository](https://github.com/scitix/InstantTensor).
+
+## Installation
+
+```bash
+pip install instanttensor
+```
+
+## Use InstantTensor in vLLM
+
+Add `--load-format instanttensor` as a command-line argument.
+
+For example:
+
+```bash
+vllm serve Qwen/Qwen2.5-0.5B --load-format instanttensor
+```
+
+## Benchmarks
+
+| Model | GPU | Backend | Load Time (s) | Throughput (GB/s) | Speedup |
+| --- | ---: | --- | ---: | ---: | --- |
+| Qwen3-30B-A3B | 1*H200 | Safetensors | 57.4 | 1.1 | 1x |
+| Qwen3-30B-A3B | 1*H200 | InstantTensor | 1.77 | 35 | <span style="color: green">**32.4x**</span> |
+| DeepSeek-R1 | 8*H200 | Safetensors | 160 | 4.3 | 1x |
+| DeepSeek-R1 | 8*H200 | InstantTensor | 15.3 | 45 | <span style="color: green">**10.5x**</span> |
+
+For the full benchmark results, see <https://github.com/scitix/InstantTensor/blob/main/docs/benchmark.md>.
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -44,4 +44,5 @@ numba == 0.61.2 # Required for N-gram speculative decoding
 numpy
 runai-model-streamer[s3,gcs]==0.15.3
 fastsafetensors>=0.2.2
+instanttensor>=0.1.5
 pydantic>=2.12 # 2.11 leads to error on python 3.13
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -57,6 +57,7 @@ numba == 0.61.2 # Required for N-gram speculative decoding
 numpy
 runai-model-streamer[s3,gcs]==0.15.3
 fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
+instanttensor>=0.1.5
 pydantic>=2.12 # 2.11 leads to error on python 3.13
 decord==0.6.0
 terratorch >= 1.2.2 # Required for Prithvi tests

--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -375,6 +375,8 @@ inflect==5.6.2
    # via datamodel-code-generator
 iniconfig==2.0.0
    # via pytest
+instanttensor==0.1.5
+    # via -r requirements/test.in
 isoduration==20.11.0
    # via jsonschema
 isort==5.13.2
@@ -1169,6 +1171,7 @@ torch==2.10.0+cu129
    #   accelerate
    #   bitsandbytes
    #   encodec
+    #   instanttensor
    #   kornia
    #   lightly
    #   lightning

--- a/setup.py
+++ b/setup.py
@@ -969,6 +969,7 @@ setup(
        "bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy", "plotly"],
        "tensorizer": ["tensorizer==2.10.1"],
        "fastsafetensors": ["fastsafetensors >= 0.2.2"],
+        "instanttensor": ["instanttensor >= 0.1.5"],
        "runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
        "audio": [
            "librosa",

--- a/tests/model_executor/model_loader/instanttensor_loader/__init__.py
+++ b/tests/model_executor/model_loader/instanttensor_loader/__init__.py
--- a/tests/model_executor/model_loader/instanttensor_loader/test_instanttensor_loader.py
+++ b/tests/model_executor/model_loader/instanttensor_loader/test_instanttensor_loader.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm import SamplingParams
+from vllm.platforms import current_platform
+
+test_model = "openai-community/gpt2"
+
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda(),
+    reason="InstantTensor requires NVIDIA GPUs",
+)
+def test_model_loader_download_files(vllm_runner):
+    with vllm_runner(test_model, load_format="instanttensor") as llm:
+        deserialized_outputs = llm.generate(prompts, sampling_params)
+        assert deserialized_outputs
--- a/tests/model_executor/model_loader/instanttensor_loader/test_weight_utils.py
+++ b/tests/model_executor/model_loader/instanttensor_loader/test_weight_utils.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import glob
+import tempfile
+
+import huggingface_hub.constants
+import pytest
+import torch
+
+from vllm.model_executor.model_loader.weight_utils import (
+    download_weights_from_hf,
+    instanttensor_weights_iterator,
+    safetensors_weights_iterator,
+)
+from vllm.platforms import current_platform
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda(),
+    reason="InstantTensor requires NVIDIA GPUs",
+)
+def test_instanttensor_model_loader():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        huggingface_hub.constants.HF_HUB_OFFLINE = False
+        download_weights_from_hf(
+            "openai-community/gpt2", allow_patterns=["*.safetensors"], cache_dir=tmpdir
+        )
+        safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
+        assert len(safetensors) > 0
+
+        instanttensor_tensors = {}
+        hf_safetensors_tensors = {}
+
+        for name, tensor in instanttensor_weights_iterator(safetensors, True):
+            # Copy the tensor immediately as it is a reference to the internal
+            # buffer of instanttensor.
+            instanttensor_tensors[name] = tensor.to("cpu")
+
+        for name, tensor in safetensors_weights_iterator(safetensors, True):
+            hf_safetensors_tensors[name] = tensor
+
+        assert len(instanttensor_tensors) == len(hf_safetensors_tensors)
+
+        for name, instanttensor_tensor in instanttensor_tensors.items():
+            assert instanttensor_tensor.dtype == hf_safetensors_tensors[name].dtype
+            assert instanttensor_tensor.shape == hf_safetensors_tensors[name].shape
+            assert torch.all(instanttensor_tensor.eq(hf_safetensors_tensors[name]))
+
+
+if __name__ == "__main__":
+    test_instanttensor_model_loader()
--- a/vllm/config/load.py
+++ b/vllm/config/load.py
@@ -29,6 +29,9 @@ class LoadConfig:
    back to the pytorch bin format if safetensors format is not available.\n
    - "pt" will load the weights in the pytorch bin format.\n
    - "safetensors" will load the weights in the safetensors format.\n
+    - "instanttensor" will load the Safetensors weights on CUDA devices using
+    InstantTensor, which enables distributed loading with pipelined prefetching
+    and fast direct I/O.\n
    - "npcache" will load the weights in pytorch format and store a numpy cache
    to speed up the loading.\n
    - "dummy" will initialize the weights with random values, which is mainly
@@ -46,7 +49,7 @@ class LoadConfig:
    - "gguf" will load weights from GGUF format files (details specified in
    https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
    - "mistral" will load weights from consolidated safetensors files used by
-    Mistral models.
+    Mistral models.\n
    - Other custom values can be supported via plugins."""
    download_dir: str | None = None
    """Directory to download and load the weights, default to the default

--- a/vllm/model_executor/model_loader/__init__.py
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -35,6 +35,7 @@ LoadFormats = Literal[
    "dummy",
    "fastsafetensors",
    "gguf",
+    "instanttensor",
    "mistral",
    "npcache",
    "pt",
@@ -51,6 +52,7 @@ _LOAD_FORMAT_TO_MODEL_LOADER: dict[str, type[BaseModelLoader]] = {
    "dummy": DummyModelLoader,
    "fastsafetensors": DefaultModelLoader,
    "gguf": GGUFModelLoader,
+    "instanttensor": DefaultModelLoader,
    "mistral": DefaultModelLoader,
    "npcache": DefaultModelLoader,
    "pt": DefaultModelLoader,

--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -23,6 +23,7 @@ from vllm.model_executor.model_loader.weight_utils import (
    filter_duplicate_safetensors_files,
    filter_files_not_needed_for_inference,
    get_quant_config,
+    instanttensor_weights_iterator,
    maybe_download_from_modelscope,
    multi_thread_pt_weights_iterator,
    multi_thread_safetensors_weights_iterator,
@@ -121,7 +122,11 @@ class DefaultModelLoader(BaseModelLoader):
        # Some quantized models use .pt files for storing the weights.
        if load_format == "hf":
            allow_patterns = ["*.safetensors", "*.bin"]
-        elif load_format == "safetensors" or load_format == "fastsafetensors":
+        elif (
+            load_format == "safetensors"
+            or load_format == "fastsafetensors"
+            or load_format == "instanttensor"
+        ):
            use_safetensors = True
            allow_patterns = ["*.safetensors"]
        elif load_format == "mistral":
@@ -219,6 +224,11 @@ class DefaultModelLoader(BaseModelLoader):
                    hf_weights_files,
                    self.load_config.use_tqdm_on_load,
                )
+            elif self.load_config.load_format == "instanttensor":
+                weights_iterator = instanttensor_weights_iterator(
+                    hf_weights_files,
+                    self.load_config.use_tqdm_on_load,
+                )
            else:
                if extra_config.get("enable_multithread_load"):
                    weights_iterator = multi_thread_safetensors_weights_iterator(

--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -29,7 +29,7 @@ from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 from vllm import envs
 from vllm.config import ModelConfig
 from vllm.config.load import LoadConfig
-from vllm.distributed import get_tensor_model_parallel_rank
+from vllm.distributed import get_tensor_model_parallel_rank, get_world_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (
    QuantizationConfig,
@@ -909,6 +909,46 @@ def fastsafetensors_weights_iterator(
            loader.close()


+def instanttensor_weights_iterator(
+    hf_weights_files: list[str],
+    use_tqdm_on_load: bool,
+) -> Generator[tuple[str, torch.Tensor], None, None]:
+    """Iterate over the weights in the model safetensor files
+    using instanttensor library."""
+    try:
+        import instanttensor
+    except ImportError as e:
+        raise ImportError(
+            "Please install instanttensor via `pip install instanttensor`"
+        ) from e
+
+    if not current_platform.is_cuda():
+        raise ValueError("InstantTensor requires NVIDIA GPUs")
+
+    try:
+        world_group = get_world_group()
+    except AssertionError:
+        # Entering here only in unit tests where the world group is not initialized.
+        process_group = None
+    else:
+        process_group = world_group.device_group if world_group.world_size > 1 else None
+
+    device = current_platform.current_device()
+
+    with instanttensor.safe_open(
+        hf_weights_files, framework="pt", device=device, process_group=process_group
+    ) as f:
+        yield from tqdm(
+            f.tensors(),
+            desc="Loading safetensors using InstantTensor loader",
+            disable=not enable_tqdm(use_tqdm_on_load),
+            bar_format=_BAR_FORMAT,
+            position=tqdm._get_free_pos(),
+            total=len(f.keys()),
+            mininterval=1.0,
+        )
+
+
 def pt_weights_iterator(
    hf_weights_files: list[str],
    use_tqdm_on_load: bool,