Init model on GPU to reduce CPU memory footprint (#1796)

a8b150c5 · ljss · GitHub · 665cbcec · a8b150c5
Unverified commit a8b150c5, authored Nov 28, 2023 by ljss; committed by GitHub Nov 27, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 3 deletions

vllm/model_executor/model_loader.py vllm/model_executor/model_loader.py +2 -3

No files found.
--- a/vllm/model_executor/model_loader.py
+++ b/vllm/model_executor/model_loader.py
@@ -87,9 +87,9 @@ def get_model(model_config: ModelConfig) -> nn.Module:
    with _set_default_torch_dtype(model_config.dtype):
        # Create a model instance.
        # The weights will be initialized as empty tensors.
-        model = model_class(model_config.hf_config, linear_method)
+        with torch.device("cuda"):
+            model = model_class(model_config.hf_config, linear_method)
        if model_config.load_format == "dummy":
-            model = model.cuda()
            # NOTE(woosuk): For accurate performance evaluation, we assign
            # random values to the weights.
            initialize_dummy_weights(model)
@@ -97,5 +97,4 @@ def get_model(model_config: ModelConfig) -> nn.Module:
            # Load the weights from the cached or downloaded files.
            model.load_weights(model_config.model, model_config.download_dir,
                               model_config.load_format, model_config.revision)
-            model = model.cuda()
    return model.eval()