Fix ShardedStateLoader for vllm fp8 quantization (#7708)

4f419c00 · Flex Wang · GitHub · a3fce56b · 4f419c00
Unverified commit 4f419c00, authored Aug 22, 2024 by Flex Wang and committed by GitHub on Aug 22, 2024.
Show whitespace changes
Inline Side-by-side

Showing with 4 additions and 0 deletions

vllm/model_executor/model_loader/loader.py (+4 -0)

No files found.
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -579,6 +579,10 @@ class ShardedStateLoader(BaseModelLoader):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config,
                                          lora_config, cache_config)
+                for _, module in model.named_modules():
+                    quant_method = getattr(module, "quant_method", None)
+                    if quant_method is not None:
+                        quant_method.process_weights_after_loading(module)
            rank = get_tensor_model_parallel_rank()
            pattern = os.path.join(
                local_model_path,