"vscode:/vscode.git/clone" did not exist on "61aedb5ffe056f83b1edab15610a644d32f40071"
Commit aa1e273a authored by zhuwenwen's avatar zhuwenwen
Browse files

update torch version and support lm_head nn layout

parent 4851c202
......@@ -190,7 +190,6 @@ set(VLLM_EXT_SRC
"csrc/opt/activation_kernels_opt.cu"
"csrc/attention/attention_kernels_opt.cu"
"csrc/opt/layernorm_kernels_opt.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
# "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
# "csrc/quantization/fp8/common.cu"
......
......@@ -42,9 +42,9 @@ vLLM支持
#### 编译环境准备
提供2种环境准备方式:
1. 基于光源pytorch2.1.0基础镜像环境:镜像下载地址:[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch),根据pytorch2.1.0、python、dtk及系统下载对应的镜像版本。
1. 基于光源pytorch2.3.0基础镜像环境:镜像下载地址:[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch),根据pytorch2.3.0、python、dtk及系统下载对应的镜像版本。
2. 基于现有python环境:安装pytorch2.1.0,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch),根据python、dtk版本,下载对应pytorch2.1.0的whl包。安装命令如下:
2. 基于现有python环境:安装pytorch2.3.0,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch),根据python、dtk版本,下载对应pytorch2.3.0的whl包。安装命令如下:
```shell
pip install torch* (下载的torch的whl包)
pip install setuptools wheel
......@@ -70,9 +70,9 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
```
#### 运行基础环境准备
1、使用上面基于光源pytorch2.1.0基础镜像环境
1、使用上面基于光源pytorch2.3.0基础镜像环境
2、根据pytorch2.1.0、python、dtk及系统下载对应的依赖包:
2、根据pytorch2.3.0、python、dtk及系统下载对应的依赖包:
- triton:[https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton/)
- xformers:[https://cancon.hpccube.com:65024/4/main/xformers](https://cancon.hpccube.com:65024/4/main/xformers)
- flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn)
......
......@@ -10,8 +10,8 @@ peft
pytest-asyncio
tensorizer>=2.9.0
torch == 2.1.0
torch == 2.3.0
triton == 2.1.0
flash_attn == 2.0.4
flash_attn == 2.6.1
xformers == 0.0.25
lmslim == 0.1.0
\ No newline at end of file
......@@ -12,12 +12,16 @@ from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase, method_has_implemented_embedding)
from vllm.model_executor.parameter import BasevLLMParameter
from vllm.model_executor.utils import set_weight_attrs
import os
DEFAULT_VOCAB_PADDING_SIZE = 64
class UnquantizedEmbeddingMethod(QuantizeMethodBase):
"""Unquantized method for embeddings."""
# Read the LLAMA_NN environment variable once, at construction time.
# use_llama_nn is True only when the variable is exactly the string '1';
# an unset variable or any other value leaves it False.
# The flag is checked later in this class to choose between
# torch.matmul/torch.addmm on layer.weight and F.linear.
# NOTE(review): the matmul path uses layer.weight without the transpose that
# F.linear applies, so it presumably expects the weight to already be stored
# as (in_features, out_features) -- confirm against the weight loader.
def __init__(self):
self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
def create_weights(self, layer: torch.nn.Module,
input_size_per_partition: int,
......@@ -37,7 +41,16 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
return F.linear(x, layer.weight, bias)
if self.use_llama_nn:
if bias is not None:
if len(x.shape) == 2:
return torch.addmm(bias, x, layer.weight)
else:
return torch.matmul(x, layer.weight) + bias
else:
return torch.matmul(x, layer.weight)
else:
return F.linear(x, layer.weight, bias)
def embedding(self, layer: torch.nn.Module,
input_: torch.Tensor) -> torch.Tensor:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment