"vllm/vscode:/vscode.git/clone" did not exist on "5daffe7cf6db9765bd667d1a2cf5f18843d58fc7"
Commit aa1e273a authored by zhuwenwen's avatar zhuwenwen
Browse files

update torch version and support lm_head nn layout

parent 4851c202
...@@ -190,7 +190,6 @@ set(VLLM_EXT_SRC ...@@ -190,7 +190,6 @@ set(VLLM_EXT_SRC
"csrc/opt/activation_kernels_opt.cu" "csrc/opt/activation_kernels_opt.cu"
"csrc/attention/attention_kernels_opt.cu" "csrc/attention/attention_kernels_opt.cu"
"csrc/opt/layernorm_kernels_opt.cu" "csrc/opt/layernorm_kernels_opt.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
# "csrc/quantization/gptq/q_gemm.cu" # "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
# "csrc/quantization/fp8/common.cu" # "csrc/quantization/fp8/common.cu"
......
...@@ -42,9 +42,9 @@ vLLM支持 ...@@ -42,9 +42,9 @@ vLLM支持
#### 编译环境准备 #### 编译环境准备
提供2种环境准备方式: 提供2种环境准备方式:
1. 基于光源pytorch2.1.0基础镜像环境:镜像下载地址:[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch),根据pytorch2.1.0、python、dtk及系统下载对应的镜像版本。 1. 基于光源pytorch2.3.0基础镜像环境:镜像下载地址:[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch),根据pytorch2.3.0、python、dtk及系统下载对应的镜像版本。
2. 基于现有python环境:安装pytorch2.1.0,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch),根据python、dtk版本,下载对应pytorch2.1.0的whl包。安装命令如下: 2. 基于现有python环境:安装pytorch2.3.0,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch),根据python、dtk版本,下载对应pytorch2.3.0的whl包。安装命令如下:
```shell ```shell
pip install torch* (下载的torch的whl包) pip install torch* (下载的torch的whl包)
pip install setuptools wheel pip install setuptools wheel
...@@ -70,9 +70,9 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install ...@@ -70,9 +70,9 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
``` ```
#### 运行基础环境准备 #### 运行基础环境准备
1、使用上面基于光源pytorch2.1.0基础镜像环境 1、使用上面基于光源pytorch2.3.0基础镜像环境
2、根据pytorch2.1.0、python、dtk及系统下载对应的依赖包: 2、根据pytorch2.3.0、python、dtk及系统下载对应的依赖包:
- triton:[https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton/) - triton:[https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton/)
- xformers:[https://cancon.hpccube.com:65024/4/main/xformers](https://cancon.hpccube.com:65024/4/main/xformers) - xformers:[https://cancon.hpccube.com:65024/4/main/xformers](https://cancon.hpccube.com:65024/4/main/xformers)
- flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn) - flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn)
......
...@@ -10,8 +10,8 @@ peft ...@@ -10,8 +10,8 @@ peft
pytest-asyncio pytest-asyncio
tensorizer>=2.9.0 tensorizer>=2.9.0
torch == 2.1.0 torch == 2.3.0
triton == 2.1.0 triton == 2.1.0
flash_attn == 2.0.4 flash_attn == 2.6.1
xformers == 0.0.25 xformers == 0.0.25
lmslim == 0.1.0 lmslim == 0.1.0
\ No newline at end of file
...@@ -12,6 +12,7 @@ from vllm.model_executor.layers.quantization.base_config import ( ...@@ -12,6 +12,7 @@ from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase, method_has_implemented_embedding) QuantizationConfig, QuantizeMethodBase, method_has_implemented_embedding)
from vllm.model_executor.parameter import BasevLLMParameter from vllm.model_executor.parameter import BasevLLMParameter
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
import os
DEFAULT_VOCAB_PADDING_SIZE = 64 DEFAULT_VOCAB_PADDING_SIZE = 64
...@@ -19,6 +20,9 @@ DEFAULT_VOCAB_PADDING_SIZE = 64 ...@@ -19,6 +20,9 @@ DEFAULT_VOCAB_PADDING_SIZE = 64
class UnquantizedEmbeddingMethod(QuantizeMethodBase): class UnquantizedEmbeddingMethod(QuantizeMethodBase):
"""Unquantized method for embeddings.""" """Unquantized method for embeddings."""
# Read the LLAMA_NN environment switch once at construction time:
# use_llama_nn is True only when the variable is exactly '1'
# (unset or any other value yields False).
def __init__(self):
self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
def create_weights(self, layer: torch.nn.Module, def create_weights(self, layer: torch.nn.Module,
input_size_per_partition: int, input_size_per_partition: int,
output_partition_sizes: List[int], input_size: int, output_partition_sizes: List[int], input_size: int,
...@@ -37,6 +41,15 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase): ...@@ -37,6 +41,15 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
layer: torch.nn.Module, layer: torch.nn.Module,
x: torch.Tensor, x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor: bias: Optional[torch.Tensor] = None) -> torch.Tensor:
if self.use_llama_nn:
if bias is not None:
if len(x.shape) == 2:
return torch.addmm(bias, x, layer.weight)
else:
return torch.matmul(x, layer.weight) + bias
else:
return torch.matmul(x, layer.weight)
else:
return F.linear(x, layer.weight, bias) return F.linear(x, layer.weight, bias)
def embedding(self, layer: torch.nn.Module, def embedding(self, layer: torch.nn.Module,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment