Unverified Commit 2a5f0100 authored by Baizhou Zhang's avatar Baizhou Zhang Committed by GitHub
Browse files

Fix GGuf and add back test_gguf.py (#7067)

parent dbdf76ca
...@@ -546,8 +546,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear): ...@@ -546,8 +546,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
param.shard_id.append(loaded_shard_id) param.shard_id.append(loaded_shard_id)
param.shard_id_map[loaded_shard_id] = len(param.data_container) param.shard_id_map[loaded_shard_id] = len(param.data_container)
param.data_container.append(loaded_weight) param.data_container.append(loaded_weight)
if len(param.data_container) == 2:
self.qweight = param.materialize_nested()
return return
param_data = param.data param_data = param.data
...@@ -961,8 +959,6 @@ class QKVParallelLinear(ColumnParallelLinear): ...@@ -961,8 +959,6 @@ class QKVParallelLinear(ColumnParallelLinear):
param.shard_id.append(loaded_shard_id) param.shard_id.append(loaded_shard_id)
param.shard_id_map[loaded_shard_id] = len(param.data_container) param.shard_id_map[loaded_shard_id] = len(param.data_container)
param.data_container.append(loaded_weight) param.data_container.append(loaded_weight)
if len(param.data_container) == 3:
self.qweight = param.materialize_nested()
return return
param_data = param.data param_data = param.data
......
...@@ -1259,12 +1259,19 @@ class GGUFModelLoader(BaseModelLoader): ...@@ -1259,12 +1259,19 @@ class GGUFModelLoader(BaseModelLoader):
): ):
model_config.hf_config.update({"tie_word_embeddings": True}) model_config.hf_config.update({"tie_word_embeddings": True})
target_device = torch.device(device_config.device)
with set_default_torch_dtype(model_config.dtype): with set_default_torch_dtype(model_config.dtype):
with torch.device(device_config.device): with target_device:
model = _initialize_model(model_config, self.load_config) model = _initialize_model(model_config, self.load_config)
model.load_weights( model.load_weights(
self._get_weights_iterator(local_model_path, gguf_weights_map) self._get_weights_iterator(local_model_path, gguf_weights_map)
) )
for _, module in model.named_modules():
quant_method = getattr(module, "quant_method", None)
if quant_method is not None:
with device_loading_context(module, target_device):
quant_method.process_weights_after_loading(module)
return model return model
......
...@@ -186,7 +186,7 @@ suites = { ...@@ -186,7 +186,7 @@ suites = {
"vllm_dependency_test": [ "vllm_dependency_test": [
TestFile("test_awq.py"), TestFile("test_awq.py"),
TestFile("test_bnb.py"), TestFile("test_bnb.py"),
# TestFile("test_gguf.py", 78), # TODO: Fix GGuf after updating to torch 2.7 and vllm 0.9 TestFile("test_gguf.py", 78),
TestFile("test_gptqmodel_dynamic.py", 72), TestFile("test_gptqmodel_dynamic.py", 72),
TestFile("test_vllm_dependency.py"), TestFile("test_vllm_dependency.py"),
], ],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment