fix: small bug for llama-405b fp16 (#733)

252e0f7b · Ying Sheng · GitHub · 7f6f2f0f · 252e0f7b · 252e0f7b
Unverified commit 252e0f7b, authored Jul 25, 2024 by Ying Sheng; committed by GitHub on Jul 25, 2024.
Show whitespace changes
Inline Side-by-side

Showing with 2 additions and 1 deletion

python/sglang/srt/managers/controller/model_runner.py +1 -1

python/sglang/srt/utils.py +1 -0

No files found.
--- a/python/sglang/srt/managers/controller/model_runner.py
+++ b/python/sglang/srt/managers/controller/model_runner.py
@@ -121,7 +121,7 @@ class ModelRunner:
            skip_tokenizer_init=True,
        )

-        if is_llama3_405b_fp8(self.model_config):
+        if is_llama3_405b_fp8(self.model_config) and self.tp_size <= 8:
            # A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints
            self.model_config.hf_config.num_key_value_heads = 8
            vllm_model_config.hf_config.num_key_value_heads = 8

--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -626,6 +626,7 @@ def is_llama3_405b_fp8(model_config):
        and model_config.hf_config.intermediate_size == 53248
        and model_config.hf_config.num_hidden_layers == 126
        and model_config.hf_config.num_key_value_heads == 16
+        and hasattr(model_config.hf_config, "quantization_config")
        and model_config.hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
    ):
        return True