"tests/vscode:/vscode.git/clone" did not exist on "58117664ed5ceb02cbce73f3ee3119728721c37c"
Unverified Commit 8c29042b authored by arlo's avatar arlo Committed by GitHub
Browse files

[Feature] Add InstantTensor weight loader (#36139)

parent 5467d137
......@@ -31,7 +31,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get update -y \
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof xz-utils \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof make xz-utils \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
......@@ -154,7 +154,7 @@ WORKDIR /vllm-workspace
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get install -y --no-install-recommends vim numactl make clangd-14
apt-get install -y --no-install-recommends vim numactl clangd-14
RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd
......
# Loading Model Weights with InstantTensor
InstantTensor accelerates the loading of Safetensors weights onto CUDA devices through distributed loading, pipelined prefetching, and direct I/O. It also supports GPUDirect Storage (GDS) when available.
For more details, see the [InstantTensor GitHub repository](https://github.com/scitix/InstantTensor).
## Installation
```bash
pip install instanttensor
```
## Use InstantTensor in vLLM
Add `--load-format instanttensor` as a command-line argument.
For example:
```bash
vllm serve Qwen/Qwen2.5-0.5B --load-format instanttensor
```
## Benchmarks
| Model | GPU | Backend | Load Time (s) | Throughput (GB/s) | Speedup |
| --- | ---: | --- | ---: | ---: | --- |
| Qwen3-30B-A3B | 1*H200 | Safetensors | 57.4 | 1.1 | 1x |
| Qwen3-30B-A3B | 1*H200 | InstantTensor | 1.77 | 35 | <span style="color: green">**32.4x**</span> |
| DeepSeek-R1 | 8*H200 | Safetensors | 160 | 4.3 | 1x |
| DeepSeek-R1 | 8*H200 | InstantTensor | 15.3 | 45 | <span style="color: green">**10.5x**</span> |
For the full benchmark results, see <https://github.com/scitix/InstantTensor/blob/main/docs/benchmark.md>.
......@@ -44,4 +44,5 @@ numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.2.2
instanttensor>=0.1.5
pydantic>=2.12 # 2.11 leads to error on python 3.13
......@@ -57,6 +57,7 @@ numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
instanttensor>=0.1.5
pydantic>=2.12 # 2.11 leads to error on python 3.13
decord==0.6.0
terratorch >= 1.2.2 # Required for Prithvi tests
......
......@@ -375,6 +375,8 @@ inflect==5.6.2
# via datamodel-code-generator
iniconfig==2.0.0
# via pytest
instanttensor==0.1.5
# via -r requirements/test.in
isoduration==20.11.0
# via jsonschema
isort==5.13.2
......@@ -1169,6 +1171,7 @@ torch==2.10.0+cu129
# accelerate
# bitsandbytes
# encodec
# instanttensor
# kornia
# lightly
# lightning
......
......@@ -969,6 +969,7 @@ setup(
"bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy", "plotly"],
"tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.2.2"],
"instanttensor": ["instanttensor >= 0.1.5"],
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
"audio": [
"librosa",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm import SamplingParams
from vllm.platforms import current_platform
# Model identifier passed to ``vllm_runner`` by the test in this module.
test_model = "openai-community/gpt2"
# Prompts handed to ``llm.generate`` by the test in this module.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object. The seed is pinned to 0.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="InstantTensor requires NVIDIA GPUs",
)
def test_model_loader_download_files(vllm_runner):
    """Smoke-test generation with ``load_format="instanttensor"``.

    The test is skipped unless the current platform reports CUDA. It only
    checks that ``generate`` returns a truthy result for the module-level
    prompts; the generated text itself is not validated.
    """
    runner = vllm_runner(test_model, load_format="instanttensor")
    with runner as llm:
        outputs = llm.generate(prompts, sampling_params)
        assert outputs
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import glob
import tempfile
import huggingface_hub.constants
import pytest
import torch
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf,
instanttensor_weights_iterator,
safetensors_weights_iterator,
)
from vllm.platforms import current_platform
@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="InstantTensor requires NVIDIA GPUs",
)
def test_instanttensor_model_loader():
    """Check that InstantTensor yields the same weights as the safetensors loader.

    Downloads the ``*.safetensors`` files of ``openai-community/gpt2`` into a
    temporary cache directory, reads them with both
    ``instanttensor_weights_iterator`` and ``safetensors_weights_iterator``,
    and asserts that both produce the same tensor names with matching dtype,
    shape, and values.
    """
    # Force online mode so the download can proceed, and restore the previous
    # value afterwards so the override does not leak into other tests that
    # run in the same process.
    previous_offline = huggingface_hub.constants.HF_HUB_OFFLINE
    huggingface_hub.constants.HF_HUB_OFFLINE = False
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            download_weights_from_hf(
                "openai-community/gpt2",
                allow_patterns=["*.safetensors"],
                cache_dir=tmpdir,
            )
            safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
            assert safetensors, "no *.safetensors files were downloaded"

            # Copy each tensor immediately as it is a reference to the
            # internal buffer of instanttensor.
            instanttensor_tensors = {
                name: tensor.to("cpu")
                for name, tensor in instanttensor_weights_iterator(safetensors, True)
            }
            hf_safetensors_tensors = dict(
                safetensors_weights_iterator(safetensors, True)
            )

            # Compare the name sets rather than just the counts, so that a
            # mismatch is reported as an assertion failure instead of a
            # KeyError in the loop below.
            assert instanttensor_tensors.keys() == hf_safetensors_tensors.keys()

            for name, actual in instanttensor_tensors.items():
                expected = hf_safetensors_tensors[name]
                assert actual.dtype == expected.dtype, name
                assert actual.shape == expected.shape, name
                assert torch.equal(actual, expected), name
    finally:
        huggingface_hub.constants.HF_HUB_OFFLINE = previous_offline


# Allow running this file directly as a script. The function is then called
# without going through pytest, so the skipif marker above is not evaluated.
if __name__ == "__main__":
    test_instanttensor_model_loader()
......@@ -29,6 +29,9 @@ class LoadConfig:
back to the pytorch bin format if safetensors format is not available.\n
- "pt" will load the weights in the pytorch bin format.\n
- "safetensors" will load the weights in the safetensors format.\n
- "instanttensor" will load the Safetensors weights on CUDA devices using
InstantTensor, which enables distributed loading with pipelined prefetching
and fast direct I/O.\n
- "npcache" will load the weights in pytorch format and store a numpy cache
to speed up the loading.\n
- "dummy" will initialize the weights with random values, which is mainly
......@@ -46,7 +49,7 @@ class LoadConfig:
- "gguf" will load weights from GGUF format files (details specified in
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
- "mistral" will load weights from consolidated safetensors files used by
Mistral models.
Mistral models.\n
- Other custom values can be supported via plugins."""
download_dir: str | None = None
"""Directory to download and load the weights, default to the default
......
......@@ -35,6 +35,7 @@ LoadFormats = Literal[
"dummy",
"fastsafetensors",
"gguf",
"instanttensor",
"mistral",
"npcache",
"pt",
......@@ -51,6 +52,7 @@ _LOAD_FORMAT_TO_MODEL_LOADER: dict[str, type[BaseModelLoader]] = {
"dummy": DummyModelLoader,
"fastsafetensors": DefaultModelLoader,
"gguf": GGUFModelLoader,
"instanttensor": DefaultModelLoader,
"mistral": DefaultModelLoader,
"npcache": DefaultModelLoader,
"pt": DefaultModelLoader,
......
......@@ -23,6 +23,7 @@ from vllm.model_executor.model_loader.weight_utils import (
filter_duplicate_safetensors_files,
filter_files_not_needed_for_inference,
get_quant_config,
instanttensor_weights_iterator,
maybe_download_from_modelscope,
multi_thread_pt_weights_iterator,
multi_thread_safetensors_weights_iterator,
......@@ -121,7 +122,11 @@ class DefaultModelLoader(BaseModelLoader):
# Some quantized models use .pt files for storing the weights.
if load_format == "hf":
allow_patterns = ["*.safetensors", "*.bin"]
elif load_format == "safetensors" or load_format == "fastsafetensors":
elif (
load_format == "safetensors"
or load_format == "fastsafetensors"
or load_format == "instanttensor"
):
use_safetensors = True
allow_patterns = ["*.safetensors"]
elif load_format == "mistral":
......@@ -219,6 +224,11 @@ class DefaultModelLoader(BaseModelLoader):
hf_weights_files,
self.load_config.use_tqdm_on_load,
)
elif self.load_config.load_format == "instanttensor":
weights_iterator = instanttensor_weights_iterator(
hf_weights_files,
self.load_config.use_tqdm_on_load,
)
else:
if extra_config.get("enable_multithread_load"):
weights_iterator = multi_thread_safetensors_weights_iterator(
......
......@@ -29,7 +29,7 @@ from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
from vllm import envs
from vllm.config import ModelConfig
from vllm.config.load import LoadConfig
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.distributed import get_tensor_model_parallel_rank, get_world_group
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import (
QuantizationConfig,
......@@ -909,6 +909,46 @@ def fastsafetensors_weights_iterator(
loader.close()
def instanttensor_weights_iterator(
    hf_weights_files: list[str],
    use_tqdm_on_load: bool,
) -> Generator[tuple[str, torch.Tensor], None, None]:
    """Stream weights from safetensors files through the InstantTensor library.

    Args:
        hf_weights_files: Safetensors files handed to
            ``instanttensor.safe_open``.
        use_tqdm_on_load: Forwarded to ``enable_tqdm`` to decide whether the
            progress bar is shown.

    Yields:
        The items produced by ``f.tensors()`` on the opened InstantTensor
        handle; per the return annotation these are ``(name, tensor)`` pairs.
        NOTE(review): the tensors presumably alias InstantTensor's internal
        buffers, so callers should copy them before advancing -- confirm
        against the InstantTensor documentation.

    Raises:
        ImportError: If the ``instanttensor`` package is not installed.
        ValueError: If the current platform is not CUDA.
    """
    try:
        import instanttensor
    except ImportError as import_err:
        raise ImportError(
            "Please install instanttensor via `pip install instanttensor`"
        ) from import_err

    if not current_platform.is_cuda():
        raise ValueError("InstantTensor requires NVIDIA GPUs")

    # Start without a process group; only a world of more than one rank
    # supplies its device group.
    process_group = None
    try:
        world_group = get_world_group()
    except AssertionError:
        # Reached only in unit tests where the world group is not initialized;
        # the default of ``None`` set above is kept.
        pass
    else:
        if world_group.world_size > 1:
            process_group = world_group.device_group

    device = current_platform.current_device()
    reader = instanttensor.safe_open(
        hf_weights_files,
        framework="pt",
        device=device,
        process_group=process_group,
    )
    with reader as f:
        # NOTE(review): ``tqdm._get_free_pos`` is a private tqdm helper --
        # confirm it remains available across tqdm upgrades.
        progress_bar = tqdm(
            f.tensors(),
            desc="Loading safetensors using InstantTensor loader",
            disable=not enable_tqdm(use_tqdm_on_load),
            bar_format=_BAR_FORMAT,
            position=tqdm._get_free_pos(),
            total=len(f.keys()),
            mininterval=1.0,
        )
        yield from progress_bar
def pt_weights_iterator(
hf_weights_files: list[str],
use_tqdm_on_load: bool,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment