update torch version and support lm_head nn layout

aa1e273a · zhuwenwen · 4851c202 · aa1e273a · aa1e273a · aa1e273a
Commit aa1e273a authored Sep 13, 2024 by zhuwenwen
4 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -190,7 +190,6 @@ set(VLLM_EXT_SRC
  "csrc/opt/activation_kernels_opt.cu"
  "csrc/attention/attention_kernels_opt.cu"
  "csrc/opt/layernorm_kernels_opt.cu"
-  "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
  # "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
  # "csrc/quantization/fp8/common.cu"

--- a/README.md
+++ b/README.md
@@ -42,9 +42,9 @@ vLLM支持
 #### 编译环境准备
 提供2种环境准备方式：

-1. 基于光源pytorch2.1.0基础镜像环境：镜像下载地址：[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch)，根据pytorch2.1.0、python、dtk及系统下载对应的镜像版本。
+1. 基于光源pytorch2.3.0基础镜像环境：镜像下载地址：[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch)，根据pytorch2.3.0、python、dtk及系统下载对应的镜像版本。

-2. 基于现有python环境：安装pytorch2.1.0，pytorch whl包下载目录：[https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch)，根据python、dtk版本,下载对应pytorch2.1.0的whl包。安装命令如下：
+2. 基于现有python环境：安装pytorch2.3.0，pytorch whl包下载目录：[https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch)，根据python、dtk版本,下载对应pytorch2.3.0的whl包。安装命令如下：
 ```shell
 pip install torch* (下载的torch的whl包)
 pip install setuptools wheel
@@ -70,9 +70,9 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
 ```

 #### 运行基础环境准备
-1、使用上面基于光源pytorch2.1.0基础镜像环境
+1、使用上面基于光源pytorch2.3.0基础镜像环境

-2、根据pytorch2.1.0、python、dtk及系统下载对应的依赖包：
+2、根据pytorch2.3.0、python、dtk及系统下载对应的依赖包：
 - triton:[https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton/)
 - xformers:[https://cancon.hpccube.com:65024/4/main/xformers](https://cancon.hpccube.com:65024/4/main/xformers)
 - flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn)

--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -10,8 +10,8 @@ peft
 pytest-asyncio
 tensorizer>=2.9.0

-torch == 2.1.0
+torch == 2.3.0
 triton == 2.1.0
-flash_attn == 2.0.4
+flash_attn == 2.6.1
 xformers == 0.0.25
 lmslim == 0.1.0
\ No newline at end of file
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -12,12 +12,16 @@ from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig, QuantizeMethodBase, method_has_implemented_embedding)
 from vllm.model_executor.parameter import BasevLLMParameter
 from vllm.model_executor.utils import set_weight_attrs
+import os

 DEFAULT_VOCAB_PADDING_SIZE = 64


 class UnquantizedEmbeddingMethod(QuantizeMethodBase):
    """Unquantized method for embeddings."""
+    
+    def __init__(self):
+        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'

    def create_weights(self, layer: torch.nn.Module,
                       input_size_per_partition: int,
@@ -37,7 +41,16 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        return F.linear(x, layer.weight, bias)
+        if self.use_llama_nn:
+            if bias is not None:
+                if len(x.shape) == 2: 
+                    return torch.addmm(bias, x, layer.weight)
+                else:
+                    return torch.matmul(x, layer.weight) + bias
+            else:
+                return torch.matmul(x, layer.weight)
+        else:
+            return F.linear(x, layer.weight, bias)

    def embedding(self, layer: torch.nn.Module,
                  input_: torch.Tensor) -> torch.Tensor: