Commit 09a80a64 authored by gaoqiong

Add the AWQ-related .so and update where the .so is stored; remove the awq_sugon folder and merge its functions into kernels/gemm_s_f16

parent 617e86ea
......@@ -69,20 +69,7 @@ def get_version_add(sha: Optional[str] = None) -> str:
file.writelines(lines)
file.close()
def copy_ck_so():
    lmdeploy_root = os.path.dirname(os.path.abspath(__file__))
    so_path = os.path.join(os.path.join(lmdeploy_root, "3rdparty"), "libgemm_multiB_int4.so")
    # dtk version
    if os.getenv("ROCM_PATH"):
        rocm_path = os.getenv('ROCM_PATH', "")
        rocm_so_path = os.path.join(rocm_path, 'lib')
        print("rocm_so_path:", rocm_so_path)
        shutil.copy(so_path, rocm_so_path)
    else:
        shutil.copy(so_path, "usr/local/lib")
def get_version():
    copy_ck_so()
    get_version_add()
    version_file = 'lmdeploy/version.py'
    with open(version_file, encoding='utf-8') as f:
......@@ -197,9 +184,24 @@ def parse_requirements(fname='requirements.txt', with_version=True):
packages += cuda_pkgs
return packages
def copy_ck_so():
    lmdeploy_root = os.path.dirname(os.path.abspath(__file__))
    so_path = os.path.join(lmdeploy_root, "3rdparty", "composable_kernel", "libgemm_multiB_int4.so")
    # dtk version
    target_path = os.path.join(lmdeploy_root, "lmdeploy", "lib")
    if os.path.exists(target_path):
        shutil.copy(so_path, target_path)
    elif os.getenv("ROCM_PATH"):
        rocm_path = os.getenv('ROCM_PATH', "")
        rocm_so_path = os.path.join(rocm_path, 'lib')
        print("rocm_so_path:", rocm_so_path)
        shutil.copy(so_path, rocm_so_path)
    else:
        shutil.copy(so_path, "/usr/local/lib")
if __name__ == '__main__':
    lmdeploy_package_data = ['lmdeploy/bin/llama_gemm']
    copy_ck_so()
    setup(
        name='lmdeploy',
        version=get_version(),
......
......@@ -2,7 +2,7 @@
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
add_library(gemm_s4_f16 STATIC gemm_s4_f16.cu format.cu ../../models/llama/awq_sugon/gemm_w4_dequation.cu)
add_library(gemm_s4_f16 STATIC gemm_s4_f16.cu format.cu)
target_compile_options(gemm_s4_f16 PRIVATE
--generate-line-info -O3 -use_fast_math -Xptxas=-v --expt-relaxed-constexpr)
set_property(TARGET gemm_s4_f16 PROPERTY POSITION_INDEPENDENT_CODE ON)
......
// Copyright (c) OpenMMLab. All rights reserved.
#include "common.h"
#include "src/turbomind/models/llama/awq_sugon/gemm_w4_dequation.cuh"
#include <iostream>
#define BLOCKSIZE 256
namespace turbomind {
......@@ -245,4 +245,40 @@ void addFusedSiluActivation(cudaStream_t stream,half* output, const half* src,in
return;
}
}
template <typename T>
__global__ void input_padding_kernel(int num_kernels,T* output,const T* input,int m,int k,int group_size,int count)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= num_kernels) return;
int j=id%(k+count*group_size);
int i=id/(k+count*group_size);
if(j<k)
{
output[i*(k+count*group_size)+j]=input[i*(k)+j];
}
else
{
output[i*(k+count*group_size)+j]=0.f;
}
}
template <typename T>
void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount)
{
// input has shape [m, k]; output has shape [m, k + pad_groupcount * group_size]
int num_kernels=m*(k+pad_groupcount*group_size);
input_padding_kernel<<<(num_kernels+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE,0,stream>>>(num_kernels, output,input,m,k,group_size,pad_groupcount);
}
#define INSTANTIATEINPUTPADING(T) \
template void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount);
INSTANTIATEINPUTPADING(__half)
} // namespace turbomind
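For orientation, here is a minimal host-side sketch of how input_padding could be driven; the buffer sizes, the helper name pad_activation_example, and the synchronization are illustrative assumptions rather than code from this commit. The output buffer must hold m * (k + pad_groupcount * group_size) half elements.
// Hypothetical usage sketch (not part of this commit): pad an [m, k] activation
// buffer with pad_groupcount extra zero-filled groups per row.
#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>

void pad_activation_example(cudaStream_t stream, const half* d_in,
                            int m, int k, int group_size, int pad_groupcount)
{
    const int k_padded = k + pad_groupcount * group_size;  // padded row length
    half* d_out = nullptr;
    cudaMalloc((void**)&d_out, sizeof(half) * m * k_padded);
    // Copies the original k columns of each row and zero-fills the padded tail.
    turbomind::input_padding<half>(stream, d_out, d_in, m, k, group_size, pad_groupcount);
    cudaStreamSynchronize(stream);
    cudaFree(d_out);
}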
......@@ -9,6 +9,22 @@
#include <memory>
#include <vector>
typedef struct ihipStream_t* hipStream_t;
extern void run_weight_only_gemm(const void *A,
const void *B0,
const void *B1,
void *C,
int M,
int N,
int K,
int StrideA,
int StrideB,
int StrideB_padded,
int StrideC,
int Group,
void* splitK_padA_workspace,
int splitK_padA_workspace_elementSize,
hipStream_t stream_id=0);
namespace turbomind {
......@@ -17,6 +33,8 @@ void dequant_w4_gemm(cudaStream_t stream, half* output,const uint32_t* weight,co
void addFusedSiluActivation(cudaStream_t stream,half* output, const half* src,int m,int n,int type);
void dequant_w4_gemm_colmajor(cudaStream_t stream, half* output,const uint32_t* weight,const half2* zeros_and_scales,int k, int n, int group_size);
template <typename T>
void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount);
class GemmS4F16 {
public:
......
......@@ -20,12 +20,12 @@ add_library(Llama STATIC
llama_kernels.cu
llama_decoder_kernels.cu
llama_utils.cu
./awq_sugon/gemm_w4_dequation.cu)
)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
#set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_directories(Llama PUBLIC ../../../../3rdparty/)
target_link_directories(Llama PUBLIC ../../../../3rdparty/composable_kernel/)
target_link_libraries(Llama PUBLIC cudart
gemm_s4_f16
cublasMMWrapper
......
......@@ -2,7 +2,6 @@
#pragma once
#include "src/turbomind/models/llama/awq_sugon/gemm_w4_dequation.cuh"
#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/llama_kernels.h"
......
#include "src/turbomind/models/llama/awq_sugon/lmdeploy_sugon.cuh"
#include "src/turbomind/models/llama/awq_sugon/gemm_w4_dequation.cuh"
template <typename T>
__global__ void add_kernel(int n,T* A,const T* B)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
A[id]=A[id]+B[id];
}
template <typename T>
__global__ void assign_kernel(int n,T* A,const T* B)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
A[id]=B[id];
}
template <typename T>
void assign_fun(cudaStream_t stream, T* A,const T* B,int size)
{
int num_kernels=size;
assign_kernel<<<(num_kernels+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE,0,stream>>>(num_kernels,A,B);
}
#define INSTANTIATEASSIGN(T) \
template void assign_fun(cudaStream_t stream, T* A,const T* B,int size);
INSTANTIATEASSIGN(__half)
INSTANTIATEASSIGN(float)
INSTANTIATEASSIGN(half2)
INSTANTIATEASSIGN(uint)
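As a hedged aside, assign_fun amounts to a typed element-wise device-to-device copy; a minimal illustrative call (the helper name and the synchronization are assumptions, not part of this commit) might look like:
// Hypothetical helper: duplicate `size` half values on the GPU.
void copy_half_buffer_example(cudaStream_t stream, half* d_dst, const half* d_src, int size)
{
    assign_fun<half>(stream, d_dst, d_src, size);  // launches assign_kernel
    cudaStreamSynchronize(stream);                 // wait so d_dst is ready to use
}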
template <typename T>
void PrintScale(cudaStream_t stream,const T* data,int size,int flag,int m,int n){
printf("start printf ****\n");
int input_size=size;
T* h_data;
h_data=new T[input_size];
T* d_data;
cudaMalloc((void**)&d_data, input_size * sizeof(T));
// optional host-side initialization (left disabled below)
// for(int i=0;i<input_size;i++)
// {
// h_data[i] = __float2half(2.0f);
// }
// cudaMemcpy(d_data, h_data, input_size * sizeof(T), cudaMemcpyHostToDevice);
assign_fun<T>(stream,d_data,data,input_size);
cudaStreamSynchronize(stream);
cudaMemcpy(h_data,d_data, input_size * sizeof(T), cudaMemcpyDeviceToHost);
if(flag!=0)
{
std::string file_name="/FrameWork/nvidia_file/elsetest/data"+std::to_string(flag)+".bin";
std::ofstream outfile(file_name, std::ios::binary);
if (!outfile) {
std::cerr << "Failed to open the file for writing." << std::endl;
}
outfile.write(reinterpret_cast<const char*>(h_data), m*n*sizeof(T));
outfile.close();
}
if constexpr (std::is_same_v<T, half>)
{
for(int i=0;i<input_size;i++)
{
printf("%f ",__half2float(h_data[i]));
}
}
else if constexpr(std::is_same_v<T, half2>)
{
for(int i=0;i<input_size;i++)
{
printf("x:%f y:%f ",__half2float(h_data[i].data[0]),__half2float(h_data[i].data[1]));
}
}
else if constexpr(std::is_same_v<T, uint>)
{
for(int i=0;i<input_size;i++)
{
printf(" %u ",h_data[i]);
}
}
printf("\n");
delete[] h_data;
cudaFree(d_data);
return ;
}
#define INSTANTIATEPRINT(T) \
template void PrintScale(cudaStream_t stream,const T* data,int size,int flag,int m,int n);
INSTANTIATEPRINT(__half)
INSTANTIATEPRINT(float)
INSTANTIATEPRINT(half2)
INSTANTIATEPRINT(uint32_t)
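For completeness, a sketch of how this debug helper might be called; the flag value and shape are illustrative assumptions. With a non-zero flag the buffer is also written to the hard-coded data<flag>.bin path above, so that directory must exist.
// Hypothetical debug call (not in this commit): print an m x n half buffer and
// dump it to data3.bin under the hard-coded directory.
void dump_device_buffer_example(cudaStream_t stream, const half* d_buf, int m, int n)
{
    PrintScale<half>(stream, d_buf, m * n, /*flag=*/3, m, n);
}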
template <typename T>
__global__ void input_padding_kernel(int num_kernels,T* output,const T* input,int m,int k,int group_size,int count)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= num_kernels) return;
int j=id%(k+count*group_size);
int i=id/(k+count*group_size);
if(j<k)
{
output[i*(k+count*group_size)+j]=input[i*(k)+j];
}
else
{
output[i*(k+count*group_size)+j]=0.f;
}
}
template <typename T>
void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount)
{
// input has shape [m, k]; output has shape [m, k + pad_groupcount * group_size]
int num_kernels=m*(k+pad_groupcount*group_size);
input_padding_kernel<<<(num_kernels+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE,0,stream>>>(num_kernels, output,input,m,k,group_size,pad_groupcount);
}
#define INSTANTIATEINPUTPADING(T) \
template void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount);
INSTANTIATEINPUTPADING(__half)
#pragma once
#include "src/turbomind/models/llama/awq_sugon/lmdeploy_sugon.cuh"
#include <string>
#include <iostream>
#include <fstream>
typedef struct ihipStream_t* hipStream_t;
// template <typename T>
// void dequant_w4_gemm(cudaStream_t stream, T* output,const uint32_t* weight,const half2* zeros_and_scales,int k,int n,int group_size);
template <typename T>
void PrintScale(cudaStream_t stream,const T* data,int size,int flag,int m,int n);
template <typename T>
void assign_fun(cudaStream_t stream, T* A,const T* B,int size);
extern void run_weight_only_gemm(const void *A,
const void *B0,
const void *B1,
void *C,
int M,
int N,
int K,
int StrideA,
int StrideB,
int StrideB_padded, // K of the input weight matrix after padding
int StrideC,
int Group,
void* splitK_padA_workspace, // device memory used for split-K and for padding tensor A
int splitK_padA_workspace_elementSize, // how large the workspace is, in bits
hipStream_t stream_id=0);
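A minimal call sketch follows, assuming B0 carries the packed int4 weights and B1 the zeros/scales (the declaration does not say), and assuming a workspace of M * StrideB_padded half elements is sufficient; all values here are illustrative, not taken from this commit.
// Hypothetical call sketch; the stride choices and workspace sizing are assumptions.
#include <cuda_fp16.h>
#include <cstdint>

void weight_only_gemm_example(hipStream_t stream,
                              const half* d_A,             // [M, K] activations
                              const uint32_t* d_B_packed,  // packed int4 weights (assumed B0)
                              const half2* d_zeros_scales, // per-group zeros/scales (assumed B1)
                              half* d_C,                   // [M, N] output
                              int M, int N, int K, int K_padded, int group_size)
{
    void* d_workspace = nullptr;
    const int workspace_elems = M * K_padded;              // illustrative sizing only
    cudaMalloc(&d_workspace, sizeof(half) * workspace_elems);

    run_weight_only_gemm(d_A, d_B_packed, d_zeros_scales, d_C,
                         M, N, K,
                         /*StrideA=*/K, /*StrideB=*/K, /*StrideB_padded=*/K_padded,
                         /*StrideC=*/N, group_size,
                         d_workspace, workspace_elems, stream);

    cudaFree(d_workspace);
}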
template <typename T>
void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount);
#define BLOCKSIZE 256
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <cassert>
#include <cstdint>
#include <type_traits>
#include <sys/time.h>
#pragma once
struct my_timer
{
timeval ts, te; // start time and end time
float dt; // elapsed time in milliseconds (ms)
void start(){
gettimeofday(&ts, NULL);
}
void stop(){
gettimeofday(&te, NULL);
long int dt_sec = te.tv_sec - ts.tv_sec;
long int dt_usec = te.tv_usec - ts.tv_usec;
dt = dt_sec * 1.0e3 + dt_usec / 1.0e3;
}
};
\ No newline at end of file
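A small, purely illustrative usage of my_timer:
// Hypothetical usage (not in this commit): time a region of host code in milliseconds.
void time_region_example()
{
    my_timer timer;
    timer.start();
    // ... launch kernels and synchronize the stream here ...
    timer.stop();
    printf("elapsed: %.3f ms\n", timer.dt);
}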
......@@ -46,21 +46,17 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
m_weightlayout_switch = std::stoi(env_weightlayout_str);
}
const char* env_dump_str = std::getenv("LMDEPLOY_DUMP_SWITCH");
if (env_dump_str != nullptr) {
m_dump_switch = std::stoi(env_dump_str);
}
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
// When rocblas is used, or when CK is used with the dump feature enabled, the dequantization workspace needs to be allocated
if(m_weightlayout_switch ==1||m_weightlayout_switch==0||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1||m_weightlayout_switch==0)
{
// temporary storage for the dequantized weights
printf("alloc space for deweight\n");
deweight_workspace_=allocator_->reMalloc(deweight_workspace_, DEQ_WORKSPACE_SIZE, false);
if(m_weightlayout_switch ==1||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1)
{
printf("alloc space for xpading\n");
printf("weight layout is tn pading rocblas\n");
......@@ -109,21 +105,17 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
m_weightlayout_switch = std::stoi(env_weightlayout_str);
}
const char* env_dump_str = std::getenv("LMDEPLOY_DUMP_SWITCH");
if (env_dump_str != nullptr) {
m_dump_switch = std::stoi(env_dump_str);
}
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
// When rocblas is used, or when CK is used with the dump feature enabled, the dequantization workspace needs to be allocated
if(m_weightlayout_switch ==1||m_weightlayout_switch==0||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1||m_weightlayout_switch==0)
{
// temporary storage for the dequantized weights
printf("alloc space for deweight\n");
deweight_workspace_=allocator_->reMalloc(deweight_workspace_, DEQ_WORKSPACE_SIZE, false);
if(m_weightlayout_switch ==1||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1)
{
printf("alloc space for xpading\n");
printf("weight layout is tn pading rocblas\n");
......@@ -146,12 +138,12 @@ cublasMMWrapper::~cublasMMWrapper()
mu_ = nullptr;
if (allocator_ != nullptr) {
allocator_->free((void**)(&cublas_workspace_));
if(m_weightlayout_switch ==1||m_weightlayout_switch==0||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1||m_weightlayout_switch==0)
{
// temporary storage for the dequantized weights
printf("free space for deweight\n");
allocator_->free((void**)(&deweight_workspace_));
if(m_weightlayout_switch ==1||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1)
{
printf("free space for xpading\n");
allocator_->free((void**)(&xpading_workspace_));
......@@ -182,21 +174,17 @@ cublasMMWrapper::cublasMMWrapper(const cublasMMWrapper& wrapper):
m_weightlayout_switch = std::stoi(env_weightlayout_str);
}
const char* env_dump_str = std::getenv("LMDEPLOY_DUMP_SWITCH");
if (env_dump_str != nullptr) {
m_dump_switch = std::stoi(env_dump_str);
}
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
// When rocblas is used, or when CK is used with the dump feature enabled, the dequantization workspace needs to be allocated
if(m_weightlayout_switch ==1||m_weightlayout_switch==0||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1||m_weightlayout_switch==0)
{
// temporary storage for the dequantized weights
printf("alloc space for deweight\n");
deweight_workspace_=allocator_->reMalloc(deweight_workspace_, DEQ_WORKSPACE_SIZE, false);
if(m_weightlayout_switch ==1||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1)
{
printf("alloc space for xpading\n");
printf("weight layout is tn pading rocblas\n");
......
......@@ -74,8 +74,8 @@ public:
// padding workspace for x
void* xpading_workspace_ = nullptr;
void* deweight_workspace_ = nullptr;
int m_weightlayout_switch = 1;
int m_dump_switch = 0;
int m_weightlayout_switch = 0;
cublasMMWrapper(cublasHandle_t cublas_handle_,
cublasLtHandle_t cublaslt_handle_,
cudaStream_t stream,
......