增加lm_nn的量化控制，控制为tn

41d0696e · gaoqiong · 8c61b2ed · 41d0696e · 41d0696e · 41d0696e
Commit 41d0696e authored Feb 18, 2025 by gaoqiong
4 changed files
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -360,6 +360,7 @@ class GPTNeoXForCausalLM(nn.Module, SupportsPP):
            loaded_params.add(name)
        #当为triton支持推理的时候不能进行处理
        if self.quant_method == "compressed_tensors":
+            os.environ['LM_NN'] = '0'
            lay_key_words = [
                "attention.query_key_value.weight",
                "attention.dense.weight",

--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -550,6 +550,7 @@ class LlamaModel(nn.Module):
        #当为triton支持推理的时候不能进行处理
        if self.quant_method == "compressed_tensors":
+            os.environ['LM_NN'] = '0'
            lay_key_words = [
                "self_attn.qkv_proj.weight",
                "self_attn.o_proj.weight",

--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -1174,6 +1174,7 @@ class QWenBaseModel(nn.Module, SupportsPP, SupportsLoRA):
                        qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
        if self.quant_method == "compressed_tensors":
+            os.environ['LM_NN'] = '0'
            lay_key_words = [
                "attn.c_attn.weight",
                "attn.c_proj.weight",

--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -527,6 +527,7 @@ class Qwen2Model(nn.Module):
                        qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
        if self.quant_method == "compressed_tensors":
+            os.environ['LM_NN'] = '0'
            lay_key_words = [
                "self_attn.qkv_proj.weight",
                "self_attn.o_proj.weight",