add LM_TN for bloom lm_head weight

2a9c497e · zhuwenwen · 85e8224c · 2a9c497e · 2a9c497e
Commit 2a9c497e authored Oct 16, 2024 by zhuwenwen
Showing 2 changed files with 7 additions and 1 deletion

vllm/model_executor/layers/vocab_parallel_embedding.py +2 -1

vllm/model_executor/model_loader/utils.py +5 -0

No files found.
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -22,6 +22,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
    def __init__(self):
        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
+        self.use_lm_tn = os.environ.get('LM_TN') == '1'
    def create_weights(self, layer: torch.nn.Module,
                       input_size_per_partition: int,
@@ -41,7 +42,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        if self.use_llama_nn:
+        if self.use_llama_nn and not self.use_lm_tn:
            if bias is not None:
                if len(x.shape) == 2: 
                    return torch.addmm(bias, x, layer.weight)

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -30,6 +30,10 @@ def get_model_architecture(
                os.environ['LLAMA_NN'] = '0'
             else:
                os.environ['LLAMA_NN'] = '1'
+        if architectures == ['BloomForCausalLM']:
+            os.environ['LM_TN'] = '1'
+        else:
+            os.environ['LM_TN'] = '0'
        if os.getenv('GEMM_PAD') != '1': 
            os.environ['GEMM_PAD'] = '0'
        if os.getenv('FA_PAD') != '1': 
@@ -46,6 +50,7 @@ def get_model_architecture(
                os.environ['AWQ_PAD'] = '0'
    else:
        os.environ['LLAMA_NN'] = '0'
+        os.environ['LM_TN'] = '0'
        os.environ['GEMM_PAD'] = '0'
        os.environ['FA_PAD'] = '0'
        os.environ['AWQ_PAD'] = '0'