[Misc] add process_weights_after_loading for DummyLoader (#8969)

82f3937e · Divakar Verma · GitHub · 7da24875 · 82f3937e
Unverified Commit 82f3937e authored Sep 30, 2024 by Divakar Verma Committed by GitHub Oct 01, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 12 additions and 0 deletions

vllm/model_executor/model_loader/loader.py vllm/model_executor/model_loader/loader.py +12 -0

No files found.
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -441,6 +441,18 @@ class DummyModelLoader(BaseModelLoader):
            # NOTE(woosuk): For accurate performance evaluation, we assign
            # random values to the weights.
            initialize_dummy_weights(model)
+            for _, module in model.named_modules():
+                quant_method = getattr(module, "quant_method", None)
+                if quant_method is not None:
+                    # When quant methods need to process weights after loading
+                    # (for repacking, quantizing, etc.), they expect parameters
+                    # to be on the global target device. This scope is for the
+                    # case where CPU offloading is used: we move the parameters
+                    # onto the device for processing and back off afterwards.
+                    with device_loading_context(
+                            module, torch.device(device_config.device)):
+                        quant_method.process_weights_after_loading(module)
        return model.eval()