Commit 09a80a64 authored by gaoqiong

Add the AWQ-related .so and update where the .so is stored; remove the awq_sugon folder and merge its functions into kernels/gemm_s_f16

parent 617e86ea
......@@ -69,20 +69,7 @@ def get_version_add(sha: Optional[str] = None) -> str:
file.writelines(lines)
file.close()
def copy_ck_so():
    lmdeploy_root = os.path.dirname(os.path.abspath(__file__))
    so_path = os.path.join(os.path.join(lmdeploy_root, "3rdparty"), "libgemm_multiB_int4.so")
    # dtk version
    if os.getenv("ROCM_PATH"):
        rocm_path = os.getenv('ROCM_PATH', "")
        rocm_so_path = os.path.join(rocm_path, 'lib')
        print("rocm_so_path:", rocm_so_path)
        shutil.copy(so_path, rocm_so_path)
    else:
        shutil.copy(so_path, "usr/local/lib")
def get_version():
    copy_ck_so()
    get_version_add()
    version_file = 'lmdeploy/version.py'
    with open(version_file, encoding='utf-8') as f:
......@@ -197,9 +184,24 @@ def parse_requirements(fname='requirements.txt', with_version=True):
packages += cuda_pkgs
return packages
def copy_ck_so():
    lmdeploy_root = os.path.dirname(os.path.abspath(__file__))
    so_path = os.path.join(lmdeploy_root, "3rdparty", "composable_kernel", "libgemm_multiB_int4.so")
    # dtk version
    target_path = os.path.join(lmdeploy_root, "lmdeploy", "lib")
    if os.path.exists(target_path):
        shutil.copy(so_path, target_path)
    elif os.getenv("ROCM_PATH"):
        rocm_path = os.getenv('ROCM_PATH', "")
        rocm_so_path = os.path.join(rocm_path, 'lib')
        print("rocm_so_path:", rocm_so_path)
        shutil.copy(so_path, rocm_so_path)
    else:
        shutil.copy(so_path, "/usr/local/lib")
if __name__ == '__main__':
    lmdeploy_package_data = ['lmdeploy/bin/llama_gemm']
    copy_ck_so()
    setup(
        name='lmdeploy',
        version=get_version(),
......
......@@ -2,7 +2,7 @@
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
add_library(gemm_s4_f16 STATIC gemm_s4_f16.cu format.cu ../../models/llama/awq_sugon/gemm_w4_dequation.cu)
add_library(gemm_s4_f16 STATIC gemm_s4_f16.cu format.cu)
target_compile_options(gemm_s4_f16 PRIVATE
--generate-line-info -O3 -use_fast_math -Xptxas=-v --expt-relaxed-constexpr)
set_property(TARGET gemm_s4_f16 PROPERTY POSITION_INDEPENDENT_CODE ON)
......
// Copyright (c) OpenMMLab. All rights reserved.
#include "common.h"
#include "src/turbomind/models/llama/awq_sugon/gemm_w4_dequation.cuh"
#include <iostream>
#define BLOCKSIZE 256
namespace turbomind {
......@@ -245,4 +245,40 @@ void addFusedSiluActivation(cudaStream_t stream,half* output, const half* src,in
return;
}
}
template <typename T>
__global__ void input_padding_kernel(int num_kernels,T* output,const T* input,int m,int k,int group_size,int count)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= num_kernels) return;
int j=id%(k+count*group_size);
int i=id/(k+count*group_size);
if(j<k)
{
output[i*(k+count*group_size)+j]=input[i*(k)+j];
}
else
{
output[i*(k+count*group_size)+j]=0.f;
}
}
template <typename T>
void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount)
{
// input has shape [m, k]; output has shape [m, k + pad_groupcount * group_size]
int num_kernels=m*(k+pad_groupcount*group_size);
input_padding_kernel<<<(num_kernels+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE,0,stream>>>(num_kernels, output,input,m,k,group_size,pad_groupcount);
}
#define INSTANTIATEINPUTPADING(T) \
template void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount);
INSTANTIATEINPUTPADING(__half)
} // namespace turbomind
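For orientation, here is a minimal host-side sketch of how input_padding could be driven; the buffer sizes, the helper name pad_activation_example, and the synchronization are illustrative assumptions rather than code from this commit. The output buffer must hold m * (k + pad_groupcount * group_size) half elements.
// Hypothetical usage sketch (not part of this commit): pad an [m, k] activation
// buffer with pad_groupcount extra zero-filled groups per row.
#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>

void pad_activation_example(cudaStream_t stream, const half* d_in,
                            int m, int k, int group_size, int pad_groupcount)
{
    const int k_padded = k + pad_groupcount * group_size;  // padded row length
    half* d_out = nullptr;
    cudaMalloc((void**)&d_out, sizeof(half) * m * k_padded);
    // Copies the original k columns of each row and zero-fills the padded tail.
    turbomind::input_padding<half>(stream, d_out, d_in, m, k, group_size, pad_groupcount);
    cudaStreamSynchronize(stream);
    cudaFree(d_out);
}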
......@@ -9,6 +9,22 @@
#include <memory>
#include <vector>
typedef struct ihipStream_t* hipStream_t;
extern void run_weight_only_gemm(const void *A,
const void *B0,
const void *B1,
void *C,
int M,
int N,
int K,
int StrideA,
int StrideB,
int StrideB_padded,
int StrideC,
int Group,
void* splitK_padA_workspace,
int splitK_padA_workspace_elementSize,
hipStream_t stream_id=0);
namespace turbomind {
......@@ -17,6 +33,8 @@ void dequant_w4_gemm(cudaStream_t stream, half* output,const uint32_t* weight,co
void addFusedSiluActivation(cudaStream_t stream,half* output, const half* src,int m,int n,int type);
void dequant_w4_gemm_colmajor(cudaStream_t stream, half* output,const uint32_t* weight,const half2* zeros_and_scales,int k, int n, int group_size);
template <typename T>
void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount);
class GemmS4F16 {
public:
......
......@@ -20,12 +20,12 @@ add_library(Llama STATIC
llama_kernels.cu
llama_decoder_kernels.cu
llama_utils.cu
./awq_sugon/gemm_w4_dequation.cu)
)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
#set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_directories(Llama PUBLIC ../../../../3rdparty/)
target_link_directories(Llama PUBLIC ../../../../3rdparty/composable_kernel/)
target_link_libraries(Llama PUBLIC cudart
gemm_s4_f16
cublasMMWrapper
......
......@@ -2,7 +2,6 @@
#pragma once
#include "src/turbomind/models/llama/awq_sugon/gemm_w4_dequation.cuh"
#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/llama_kernels.h"
......
#include "src/turbomind/models/llama/awq_sugon/lmdeploy_sugon.cuh"
#include "src/turbomind/models/llama/awq_sugon/gemm_w4_dequation.cuh"
template <typename T>
__global__ void add_kernel(int n,T* A,const T* B)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
A[id]=A[id]+B[id];
}
template <typename T>
__global__ void assign_kernel(int n,T* A,const T* B)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
A[id]=B[id];
}
template <typename T>
void assign_fun(cudaStream_t stream, T* A,const T* B,int size)
{
int num_kernels=size;
assign_kernel<<<(num_kernels+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE,0,stream>>>(num_kernels,A,B);
}
#define INSTANTIATEASSIGN(T) \
template void assign_fun(cudaStream_t stream, T* A,const T* B,int size);
INSTANTIATEASSIGN(__half)
INSTANTIATEASSIGN(float)
INSTANTIATEASSIGN(half2)
INSTANTIATEASSIGN(uint)
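As a hedged aside, assign_fun amounts to a typed element-wise device-to-device copy; a minimal illustrative call (the helper name and the synchronization are assumptions, not part of this commit) might look like:
// Hypothetical helper: duplicate `size` half values on the GPU.
void copy_half_buffer_example(cudaStream_t stream, half* d_dst, const half* d_src, int size)
{
    assign_fun<half>(stream, d_dst, d_src, size);  // launches assign_kernel
    cudaStreamSynchronize(stream);                 // wait so d_dst is ready to use
}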
template <typename T>
void PrintScale(cudaStream_t stream,const T* data,int size,int flag,int m,int n){
printf("start printf ****\n");
int input_size=size;
T* h_data;
h_data=new T[input_size];
T* d_data;
cudaMalloc((void**)&d_data, input_size * sizeof(T));
// optional host-side initialization (left disabled below)
// for(int i=0;i<input_size;i++)
// {
// h_data[i] = __float2half(2.0f);
// }
// cudaMemcpy(d_data, h_data, input_size * sizeof(T), cudaMemcpyHostToDevice);
assign_fun<T>(stream,d_data,data,input_size);
cudaStreamSynchronize(stream);
cudaMemcpy(h_data,d_data, input_size * sizeof(T), cudaMemcpyDeviceToHost);
if(flag!=0)
{
std::string file_name="/FrameWork/nvidia_file/elsetest/data"+std::to_string(flag)+".bin";
std::ofstream outfile(file_name, std::ios::binary);
if (!outfile) {
std::cerr << "Failed to open the file for writing." << std::endl;
}
outfile.write(reinterpret_cast<const char*>(h_data), m*n*sizeof(T));
outfile.close();
}
if constexpr (std::is_same_v<T, half>)
{
for(int i=0;i<input_size;i++)
{
printf("%f ",__half2float(h_data[i]));
}
}
else if constexpr(std::is_same_v<T, half2>)
{
for(int i=0;i<input_size;i++)
{
printf("x:%f y:%f ",__half2float(h_data[i].data[0]),__half2float(h_data[i].data[1]));
}
}
else if constexpr(std::is_same_v<T, uint>)
{
for(int i=0;i<input_size;i++)
{
printf(" %u ",h_data[i]);
}
}
printf("\n");
delete[] h_data;
cudaFree(d_data);
return ;
}
#define INSTANTIATEPRINT(T) \
template void PrintScale(cudaStream_t stream,const T* data,int size,int flag,int m,int n);
INSTANTIATEPRINT(__half)
INSTANTIATEPRINT(float)
INSTANTIATEPRINT(half2)
INSTANTIATEPRINT(uint32_t)
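For completeness, a sketch of how this debug helper might be called; the flag value and shape are illustrative assumptions. With a non-zero flag the buffer is also written to the hard-coded data<flag>.bin path above, so that directory must exist.
// Hypothetical debug call (not in this commit): print an m x n half buffer and
// dump it to data3.bin under the hard-coded directory.
void dump_device_buffer_example(cudaStream_t stream, const half* d_buf, int m, int n)
{
    PrintScale<half>(stream, d_buf, m * n, /*flag=*/3, m, n);
}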
template <typename T>
__global__ void input_padding_kernel(int num_kernels,T* output,const T* input,int m,int k,int group_size,int count)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= num_kernels) return;
int j=id%(k+count*group_size);
int i=id/(k+count*group_size);
if(j<k)
{
output[i*(k+count*group_size)+j]=input[i*(k)+j];
}
else
{
output[i*(k+count*group_size)+j]=0.f;
}
}
template <typename T>
void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount)
{
// input has shape [m, k]; output has shape [m, k + pad_groupcount * group_size]
int num_kernels=m*(k+pad_groupcount*group_size);
input_padding_kernel<<<(num_kernels+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE,0,stream>>>(num_kernels, output,input,m,k,group_size,pad_groupcount);
}
#define INSTANTIATEINPUTPADING(T) \
template void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount);
INSTANTIATEINPUTPADING(__half)
#pragma once
#include "src/turbomind/models/llama/awq_sugon/lmdeploy_sugon.cuh"
#include <string>
#include <iostream>
#include <fstream>
typedef struct ihipStream_t* hipStream_t;
// template <typename T>
// void dequant_w4_gemm(cudaStream_t stream, T* output,const uint32_t* weight,const half2* zeros_and_scales,int k,int n,int group_size);
template <typename T>
void PrintScale(cudaStream_t stream,const T* data,int size,int flag,int m,int n);
template <typename T>
void assign_fun(cudaStream_t stream, T* A,const T* B,int size);
extern void run_weight_only_gemm(const void *A,
const void *B0,
const void *B1,
void *C,
int M,
int N,
int K,
int StrideA,
int StrideB,
int StrideB_padded, // K of the input weight matrix after padding
int StrideC,
int Group,
void* splitK_padA_workspace, // device memory used for split-K and for padding tensor A
int splitK_padA_workspace_elementSize, // how large the workspace is, in bits
hipStream_t stream_id=0);
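A minimal call sketch follows, assuming B0 carries the packed int4 weights and B1 the zeros/scales (the declaration does not say), and assuming a workspace of M * StrideB_padded half elements is sufficient; all values here are illustrative, not taken from this commit.
// Hypothetical call sketch; the stride choices and workspace sizing are assumptions.
#include <cuda_fp16.h>
#include <cstdint>

void weight_only_gemm_example(hipStream_t stream,
                              const half* d_A,             // [M, K] activations
                              const uint32_t* d_B_packed,  // packed int4 weights (assumed B0)
                              const half2* d_zeros_scales, // per-group zeros/scales (assumed B1)
                              half* d_C,                   // [M, N] output
                              int M, int N, int K, int K_padded, int group_size)
{
    void* d_workspace = nullptr;
    const int workspace_elems = M * K_padded;              // illustrative sizing only
    cudaMalloc(&d_workspace, sizeof(half) * workspace_elems);

    run_weight_only_gemm(d_A, d_B_packed, d_zeros_scales, d_C,
                         M, N, K,
                         /*StrideA=*/K, /*StrideB=*/K, /*StrideB_padded=*/K_padded,
                         /*StrideC=*/N, group_size,
                         d_workspace, workspace_elems, stream);

    cudaFree(d_workspace);
}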
template <typename T>
void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount);
#define BLOCKSIZE 256
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <cassert>
#include <cstdint>
#include <type_traits>
#include <sys/time.h>
#pragma once
struct my_timer
{
timeval ts, te; // start time and end time
float dt; // elapsed time in milliseconds (ms)
void start(){
gettimeofday(&ts, NULL);
}
void stop(){
gettimeofday(&te, NULL);
long int dt_sec = te.tv_sec - ts.tv_sec;
long int dt_usec = te.tv_usec - ts.tv_usec;
dt = dt_sec * 1.0e3 + dt_usec / 1.0e3;
}
};
\ No newline at end of file
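A small, purely illustrative usage of my_timer:
// Hypothetical usage (not in this commit): time a region of host code in milliseconds.
void time_region_example()
{
    my_timer timer;
    timer.start();
    // ... launch kernels and synchronize the stream here ...
    timer.stop();
    printf("elapsed: %.3f ms\n", timer.dt);
}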
......@@ -46,21 +46,17 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
m_weightlayout_switch = std::stoi(env_weightlayout_str);
}
const char* env_dump_str = std::getenv("LMDEPLOY_DUMP_SWITCH");
if (env_dump_str != nullptr) {
m_dump_switch = std::stoi(env_dump_str);
}
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
// When rocblas is used, or when CK is used with the dump feature enabled, the dequantization workspace needs to be allocated
if(m_weightlayout_switch ==1||m_weightlayout_switch==0||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1||m_weightlayout_switch==0)
{
// temporary storage for the dequantized weights
printf("alloc space for deweight\n");
deweight_workspace_=allocator_->reMalloc(deweight_workspace_, DEQ_WORKSPACE_SIZE, false);
if(m_weightlayout_switch ==1||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1)
{
printf("alloc space for xpading\n");
printf("weight layout is tn pading rocblas\n");
......@@ -109,21 +105,17 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
m_weightlayout_switch = std::stoi(env_weightlayout_str);
}
const char* env_dump_str = std::getenv("LMDEPLOY_DUMP_SWITCH");
if (env_dump_str != nullptr) {
m_dump_switch = std::stoi(env_dump_str);
}
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
// When rocblas is used, or when CK is used with the dump feature enabled, the dequantization workspace needs to be allocated
if(m_weightlayout_switch ==1||m_weightlayout_switch==0||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1||m_weightlayout_switch==0)
{
// temporary storage for the dequantized weights
printf("alloc space for deweight\n");
deweight_workspace_=allocator_->reMalloc(deweight_workspace_, DEQ_WORKSPACE_SIZE, false);
if(m_weightlayout_switch ==1||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1)
{
printf("alloc space for xpading\n");
printf("weight layout is tn pading rocblas\n");
......@@ -146,12 +138,12 @@ cublasMMWrapper::~cublasMMWrapper()
mu_ = nullptr;
if (allocator_ != nullptr) {
allocator_->free((void**)(&cublas_workspace_));
if(m_weightlayout_switch ==1||m_weightlayout_switch==0||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1||m_weightlayout_switch==0)
{
// temporary storage for the dequantized weights
printf("free space for deweight\n");
allocator_->free((void**)(&deweight_workspace_));
if(m_weightlayout_switch ==1||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1)
{
printf("free space for xpading\n");
allocator_->free((void**)(&xpading_workspace_));
......@@ -182,21 +174,17 @@ cublasMMWrapper::cublasMMWrapper(const cublasMMWrapper& wrapper):
m_weightlayout_switch = std::stoi(env_weightlayout_str);
}
const char* env_dump_str = std::getenv("LMDEPLOY_DUMP_SWITCH");
if (env_dump_str != nullptr) {
m_dump_switch = std::stoi(env_dump_str);
}
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
// When rocblas is used, or when CK is used with the dump feature enabled, the dequantization workspace needs to be allocated
if(m_weightlayout_switch ==1||m_weightlayout_switch==0||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1||m_weightlayout_switch==0)
{
// temporary storage for the dequantized weights
printf("alloc space for deweight\n");
deweight_workspace_=allocator_->reMalloc(deweight_workspace_, DEQ_WORKSPACE_SIZE, false);
if(m_weightlayout_switch ==1||(m_weightlayout_switch==2&&m_dump_switch==1))
if(m_weightlayout_switch ==1)
{
printf("alloc space for xpading\n");
printf("weight layout is tn pading rocblas\n");
......
......@@ -74,8 +74,8 @@ public:
// padding workspace for x
void* xpading_workspace_ = nullptr;
void* deweight_workspace_ = nullptr;
int m_weightlayout_switch = 1;
int m_dump_switch = 0;
int m_weightlayout_switch = 0;
cublasMMWrapper(cublasHandle_t cublas_handle_,
cublasLtHandle_t cublaslt_handle_,
cudaStream_t stream,
......