Commit e38ee081 authored by xiabo

Adapt to ROCm

parent 56942c43
......@@ -44,18 +44,18 @@ option(BUILD_TEST "Build tests" OFF)
include(FetchContent)
FetchContent_Declare(
repo-cutlass
GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
GIT_TAG 6f47420213f757831fae65c686aa471749fa8d60
)
#FetchContent_Declare(
# repo-cutlass
# GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
# GIT_TAG cc85b64cf676c45f98a17e3a47c0aafcf817f088
#)
set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
#set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
FetchContent_MakeAvailable(repo-cutlass)
#FetchContent_MakeAvailable(repo-cutlass)
set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)
#set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
#set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)
option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF)
......@@ -110,6 +110,7 @@ endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -ldl") # -Xptxas -v
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --gpu-max-threads-per-block=1024")
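# --gpu-max-threads-per-block is a hipcc/amdclang option; nvcc would reject it.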
set(SM_SETS 52 60 61 70 75 80 86 89 90)
set(USING_WMMA False)
......@@ -266,7 +267,8 @@ endif()
if (BUILD_MULTI_GPU)
list(APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH})
list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)
#list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)
list(APPEND COMMON_LIB_DIRS /opt/mpi/lib)
endif()
if(USE_TRITONSERVER_DATATYPE)
......@@ -311,11 +313,11 @@ endif()
add_library(transformer-shared SHARED
$<TARGET_OBJECTS:BaseSamplingLayer>
$<TARGET_OBJECTS:DynamicDecodeLayer>
$<TARGET_OBJECTS:llama_fmha>
# $<TARGET_OBJECTS:llama_fmha>
$<TARGET_OBJECTS:flash_attention2>
$<TARGET_OBJECTS:Llama>
$<TARGET_OBJECTS:LlamaTritonBackend>
$<TARGET_OBJECTS:gemm_s4_f16>
# $<TARGET_OBJECTS:gemm_s4_f16>
$<TARGET_OBJECTS:TopKSamplingLayer>
$<TARGET_OBJECTS:TopPSamplingLayer>
$<TARGET_OBJECTS:TransformerTritonBackend>
......@@ -353,15 +355,16 @@ target_link_libraries(transformer-shared PUBLIC
endif()
if(USE_NVTX)
target_link_libraries(transformer-shared PUBLIC
-lnvToolsExt
)
#target_link_libraries(transformer-shared PUBLIC
# -lnvToolsExt
#)
endif()
set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON)
#set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
set_target_properties(transformer-shared PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcublasLt -lcurand)
#target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcublasLt -lcurand)
target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcurand)
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TurboMind)
......
#!/bin/sh
cmake .. \
-DCMAKE_CXX_COMPILER=nvcc \
-DCMAKE_C_COMPILER=nvcc \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
-DCMAKE_INSTALL_PREFIX=./install \
-DBUILD_PY_FFI=ON \
-DBUILD_MULTI_GPU=ON \
-DCMAKE_CUDA_FLAGS="-lineinfo" \
-DUSE_NVTX=ON
-DUSE_NVTX=OFF \
# -DBUILD_TEST=ON
......@@ -13,61 +13,63 @@
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
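# Note: a bare -fPIC in CMAKE_CUDA_FLAGS assumes a clang-style device compiler
# such as hipcc; under nvcc the host-side flag is spelled "-Xcompiler -fPIC".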
add_library(ban_bad_words STATIC ban_bad_words.cu)
set_property(TARGET ban_bad_words PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET ban_bad_words PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET ban_bad_words PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET ban_bad_words PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(stop_criteria STATIC stop_criteria_kernels.cu)
set_property(TARGET stop_criteria PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET stop_criteria PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET stop_criteria PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET stop_criteria PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(activation_kernels STATIC activation_kernels.cu)
set_property(TARGET activation_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET activation_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET activation_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET activation_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(logprob_kernels STATIC logprob_kernels.cu)
set_property(TARGET logprob_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET logprob_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET logprob_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET logprob_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(unfused_attention_kernels STATIC unfused_attention_kernels.cu)
set_property(TARGET unfused_attention_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET unfused_attention_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET unfused_attention_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET unfused_attention_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(bert_preprocess_kernels STATIC bert_preprocess_kernels.cu)
set_property(TARGET bert_preprocess_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET bert_preprocess_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET bert_preprocess_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET bert_preprocess_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
set(decoder_masked_multihead_attention_files
decoder_masked_multihead_attention.cu
)
file(GLOB decoder_masked_multihead_attention_files ${decoder_masked_multihead_attention_files} ./decoder_masked_multihead_attention/*.cu)
add_library(decoder_masked_multihead_attention STATIC ${decoder_masked_multihead_attention_files})
set_property(TARGET decoder_masked_multihead_attention PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoder_masked_multihead_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET decoder_masked_multihead_attention PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET decoder_masked_multihead_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(decoding_kernels STATIC decoding_kernels.cu)
set_property(TARGET decoding_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoding_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET decoding_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET decoding_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(gpt_kernels STATIC gpt_kernels.cu)
set_property(TARGET gpt_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gpt_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET gpt_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET gpt_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(sampling_topk_kernels STATIC sampling_topk_kernels.cu)
set_property(TARGET sampling_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET sampling_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET sampling_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET sampling_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(sampling_topp_kernels STATIC sampling_topp_kernels.cu)
set_property(TARGET sampling_topp_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET sampling_topp_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET sampling_topp_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET sampling_topp_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(sampling_penalty_kernels STATIC sampling_penalty_kernels.cu)
set_property(TARGET sampling_penalty_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET sampling_penalty_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET sampling_penalty_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET sampling_penalty_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(custom_ar_kernels STATIC custom_ar_kernels.cu)
set_property(TARGET custom_ar_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET custom_ar_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET custom_ar_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET custom_ar_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_subdirectory(gemm_s_f16)
#add_subdirectory(gemm_s_f16)
......@@ -106,7 +106,8 @@ struct ReluActivation<half2> {
static __device__ __forceinline__ half2 apply(const half2& val)
{
const half zero_half = static_cast<half>(0.0f);
return make_half2(val.x > zero_half ? val.x : zero_half, val.y > zero_half ? val.y : zero_half);
// return make_half2(val.x > zero_half ? val.x : zero_half, val.y > zero_half ? val.y : zero_half);
return make_half2(static_cast<half>(val.data[0]) > zero_half ? static_cast<half>(val.data[0]) : zero_half,
                  static_cast<half>(val.data[1]) > zero_half ? static_cast<half>(val.data[1]) : zero_half);
}
};
......@@ -117,7 +118,8 @@ struct ReluActivation<__nv_bfloat162> {
static __device__ __forceinline__ __nv_bfloat162 apply(const __nv_bfloat162& val)
{
const __nv_bfloat16 zero_bf16 = static_cast<__nv_bfloat16>(0.0f);
return make_bfloat162(val.x > zero_bf16 ? val.x : zero_bf16, val.y > zero_bf16 ? val.y : zero_bf16);
// return make_bfloat162(val.x > zero_bf16 ? val.x : zero_bf16, val.y > zero_bf16 ? val.y : zero_bf16);
return make_bfloat162(val.data[0] > zero_bf16 ? val.data[0] : zero_bf16,
                      val.data[1] > zero_bf16 ? val.data[1] : zero_bf16);
}
};
#endif
......@@ -138,7 +140,8 @@ struct SiluActivation<half2> {
using return_type = float2;
static __device__ __forceinline__ float2 apply(const half2& val)
{
return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
// return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
return make_float2(SiluActivation<float>::apply(val.data[0]), SiluActivation<float>::apply(val.data[1]));
}
};
......@@ -148,7 +151,8 @@ struct SiluActivation<__nv_bfloat162> {
using return_type = float2;
static __device__ __forceinline__ float2 apply(const __nv_bfloat162& val)
{
return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
// return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
return make_float2(SiluActivation<float>::apply(val.data[0]), SiluActivation<float>::apply(val.data[1]));
}
};
#endif // ENABLE_BF16
......
......@@ -24,7 +24,12 @@ namespace turbomind {
static inline __device__ uint32_t hadd2(const uint32_t& a, const uint32_t& b)
{
uint32_t c;
asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
// asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
const __half * ha = reinterpret_cast<const __half*>(&a);
const __half * hb = reinterpret_cast<const __half*>(&b);
__half2 h2c = make_half2(ha[0] + hb[0], ha[1] + hb[1]);
__builtin_memcpy(&c, &h2c, sizeof(h2c));
// asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
return c;
}
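// Equivalent sketch using the packed-half intrinsic, assuming __hadd2 from
// <cuda_fp16.h> / <hip/hip_fp16.h> is available for the target architecture
// (hadd2_alt is a hypothetical name, not used elsewhere):
static inline __device__ uint32_t hadd2_alt(const uint32_t& a, const uint32_t& b)
{
    __half2 ha, hb;
    __builtin_memcpy(&ha, &a, sizeof(ha));  // reinterpret the packed bits as __half2
    __builtin_memcpy(&hb, &b, sizeof(hb));
    const __half2 hc = __hadd2(ha, hb);     // one packed f16 add
    uint32_t c;
    __builtin_memcpy(&c, &hc, sizeof(c));
    return c;
}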
......@@ -33,7 +38,12 @@ static inline __device__ uint32_t hadd2(const uint32_t& a, const uint32_t& b)
static inline __device__ uint32_t fadd(const uint32_t& a, const uint32_t& b)
{
uint32_t c;
asm volatile("add.f32 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
// asm volatile("add.f32 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
float fa, fb;
__builtin_memcpy(&fa, &a, sizeof(fa));  // reinterpret the u32 bit patterns as f32
__builtin_memcpy(&fb, &b, sizeof(fb));
const float fc = fa + fb;
__builtin_memcpy(&c, &fc, sizeof(fc));  // memcpy-based bit reinterpretation avoids pointer-union aliasing
return c;
}
......@@ -42,10 +52,12 @@ static inline __device__ uint32_t fadd(const uint32_t& a, const uint32_t& b)
static inline __device__ void st_flag_release(uint32_t& flag, uint32_t* flag_addr)
{
#if __CUDA_ARCH__ >= 700
asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
// asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
__threadfence_system();  // approximate the release ordering of the PTX store
*((volatile uint32_t*)flag_addr) = flag;
#else
__threadfence_system();
asm volatile("st.global.volatile.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
// asm volatile("st.global.volatile.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
*((volatile uint32_t*)flag_addr) = flag;  // volatile store, fence issued above
#endif
}
......@@ -54,9 +66,11 @@ static inline __device__ void st_flag_release(uint32_t& flag, uint32_t* flag_add
static inline __device__ void ld_flag_acquire(uint32_t& flag, uint32_t* flag_addr)
{
#if __CUDA_ARCH__ >= 700
asm volatile("ld.global.acquire.sys.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
// asm volatile("ld.global.acquire.sys.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
flag = *((volatile uint32_t*)flag_addr);
__threadfence_system();  // approximate the acquire ordering of the PTX load
#else
asm volatile("ld.global.volatile.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
// asm volatile("ld.global.volatile.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
flag = *((volatile uint32_t*)flag_addr);
#endif
}
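// The same pair could lean on the compiler's atomic builtins instead of raw
// loads plus fences; a sketch, assuming hipcc/clang accepts the GNU __atomic
// builtins in __device__ code (the *_atomic names are hypothetical):
static inline __device__ void st_flag_release_atomic(uint32_t& flag, uint32_t* flag_addr)
{
    __atomic_store_n(flag_addr, flag, __ATOMIC_RELEASE);  // release store
}
static inline __device__ void ld_flag_acquire_atomic(uint32_t& flag, uint32_t* flag_addr)
{
    flag = __atomic_load_n(flag_addr, __ATOMIC_ACQUIRE);  // acquire load
}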
......
......@@ -27,7 +27,8 @@
#define MAX_ALL_REDUCE_BLOCKS 24
#define FLAG(a) ((uint32_t)((a) % 0x146))
#define RANKS_PER_NODE 8
#define WARP_SIZE 32
// #define WARP_SIZE 32
#define WARP_SIZE 64
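// AMD GCN/CDNA wavefronts are 64 lanes wide, vs. 32-lane NVIDIA warps.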
#define DEFAULT_BLOCK_SIZE 1024
#define DEFALUT_ALGO_AR_SIZE_THRESHOLD 393216
......
......@@ -628,19 +628,68 @@ struct Qk_dot {
};
////////////////////////////////////////////////////////////////////////////////////////////////////
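// f16mulf16addf32: *d = *c + lo(a) * lo(b) + hi(a) * hi(b), where a and b each pack two f16 lanes.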
__device__ inline void f16mulf16addf32(uint32_t & a, uint32_t & b, const float * c, float * d){
// uint32_t res = 0;
// asm volatile("v_pk_fma_f16 %0, %1,%2,%3" : "=v"(res) : "v"(a), "v"(b), "v"(res));
// __half * h = reinterpret_cast<__half*>(&res);
__half * ha = reinterpret_cast<__half*>(&a);
__half * hb = reinterpret_cast<__half*>(&b);
*d = *c + __half2float(ha[0])*__half2float(hb[0]) + __half2float(ha[1])*__half2float(hb[1]);
}
// row 8 col 4
__device__ inline void m16n8k8(const uint32_t * A, const uint32_t * B, /*const float * C,*/ float * D) {
int tid = threadIdx.x;
int baseId = tid / 32 * 32;
__shared__ uint32_t smem[1024*3];
int base = tid*3;
__builtin_memcpy(smem+base, A, sizeof(uint32_t));
__builtin_memcpy(smem+(base+1), A+1, sizeof(uint32_t));
__builtin_memcpy(smem+(base+2), B, sizeof(uint32_t));
__syncthreads();
/* From D's point of view, each lane computes its own portion of D: starting
   from lane 0 it loops over lanes, fetching one row of A and two columns of B.
   s is the lane index that holds the B fragment,
   baseA is the lane index that holds the A fragment,
   baseB0 is the first column of B fetched by this lane, baseB1 the second.
*/
int s = baseId+(tid%4)*8, e = s+4;
for (int i = s; i < e; ++i) {
// A[0]->i A[1]->i+1 B[0]->i+2
int baseA = (tid-tid%4+i-s)*3; // lane index of the first column of this tid's row, plus the stride, times 3
int baseB0 = i*3, baseB1 = (i+4)*3;
f16mulf16addf32(smem[baseA], smem[baseB0+2], D, D);
f16mulf16addf32(smem[baseA], smem[baseB1+2], D+1, D+1);
f16mulf16addf32(smem[baseA+1], smem[baseB0+2], D+2, D+2);
f16mulf16addf32(smem[baseA+1], smem[baseB1+2], D+3, D+3);
}
}
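// m16n8k8 above emulates one mma.sync.aligned.m16n8k8 tile per 32-lane group:
// each lane publishes its two 32-bit A words (four halves) and one 32-bit B
// word (two halves) through shared memory, then walks the four lanes that
// hold its row of A and its two columns of B (4 lanes x 2 halves = K of 8),
// accumulating the four f32 outputs it owns into D[0..3].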
inline __device__ float4 hmma_fp32(const uint2& a, uint32_t b)
{
float4 c;
float zero = 0.f;
asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n"
" {%0, %1, %2, %3}, \n"
" {%4, %5}, \n"
" {%6}, \n"
" {%7, %7, %7, %7}; \n"
: "=f"(c.x), "=f"(c.y), "=f"(c.z), "=f"(c.w)
: "r"(a.x), "r"(a.y), "r"(b), "f"(zero));
c = make_float4(zero, zero, zero, zero);  // the emulation accumulates into C, so the accumulator must start at zero
const uint32_t * A = reinterpret_cast<const uint32_t*>(&a);
const uint32_t * B = reinterpret_cast<const uint32_t*>(&b);  // take the address of b; casting the value would forge a pointer
float * C = reinterpret_cast<float*>(&c);
m16n8k8(A, B, C);
// asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n"
// " {%0, %1, %2, %3}, \n"
// " {%4, %5}, \n"
// " {%6}, \n"
// " {%7, %7, %7, %7}; \n"
// : "=f"(c.x), "=f"(c.y), "=f"(c.z), "=f"(c.w)
// : "r"(a.x), "r"(a.y), "r"(b), "f"(zero));
return c;
}
......@@ -688,7 +737,8 @@ struct Qk_dot<uint16_t, 4> {
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int WARPS_PER_BLOCK, int WARP_SIZE = 32>
// template<int WARPS_PER_BLOCK, int WARP_SIZE = 32>
template<int WARPS_PER_BLOCK, int WARP_SIZE = 64>
inline __device__ float block_sum(float* red_smem, float sum)
{
......@@ -1110,12 +1160,21 @@ inline __device__ Float8_ dequant(int64_t a, const float scale, const float zp)
inline __device__ int8_t cast_to_int8(float val)
{
union {
int8_t int8[2];
int16_t int16;
};
asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val));
return int8[0];
// union {
// int8_t int8[2];
// int16_t int16;
// };
// asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val));
// return int8[0];
// Round to nearest (as cvt.rni) and saturate to [-128, 127] (as .sat).
const int32_t r = __float2int_rn(val);
return static_cast<int8_t>(r < -128 ? -128 : (r > 127 ? 127 : r));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
......@@ -1239,7 +1298,8 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
static_assert(Dh_MAX % THREADS_PER_VALUE == 0, "");
// The size of a warp.
constexpr int WARP_SIZE = 32;
// constexpr int WARP_SIZE = 32;
constexpr int WARP_SIZE = 64;
// The number of warps in a threadblock.
constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE;
......
......@@ -147,7 +147,8 @@ inline __device__ float4 add(float4 a, float4 b)
inline __device__ uint16_t add(uint16_t a, uint16_t b)
{
uint16_t c;
asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
// asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
asm volatile("v_add_f16 %0, %1, %2;" : "=v"(c) : "v"(a), "v"(b));
return c;
}
......@@ -156,7 +157,11 @@ inline __device__ uint16_t add(uint16_t a, uint16_t b)
inline __device__ uint32_t add(uint32_t a, uint32_t b)
{
uint32_t c;
asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
// asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
const __half *ha = reinterpret_cast<const __half*>(&a);
const __half *hb = reinterpret_cast<const __half*>(&b);
__half2 h2c = make_half2(ha[0]+hb[0], ha[1]+hb[1]);
__builtin_memcpy(&c, &h2c, sizeof(h2c));
return c;
}
......@@ -192,9 +197,13 @@ inline __device__ uint16_t float_to_half(float f)
} tmp;
#if 0 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 // Is it better?
float zero = 0.f;
asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(zero), "f"(f));
// asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(zero), "f"(f));
__half h=__float2half(f);
tmp.u16[0] = reinterpret_cast<const uint16_t&>(h);
#else
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f));
// asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f));
__half h=__float2half(f);
tmp.u16[0] = reinterpret_cast<const uint16_t&>(h);
#endif
return tmp.u16[0];
}
......@@ -208,10 +217,18 @@ inline __device__ uint32_t float2_to_half2(float2 f)
uint16_t u16[2];
} tmp;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
// asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
__half h1 = __float2half(f.x);
__half h2 = __float2half(f.y);
tmp.u16[0] = reinterpret_cast<const uint16_t&>(h1);
tmp.u16[1] = reinterpret_cast<const uint16_t&>(h2);
#else
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
// asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
// asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
__half h1 = __float2half(f.x);
__half h2 = __float2half(f.y);
tmp.u16[0] = reinterpret_cast<const uint16_t&>(h1);
tmp.u16[1] = reinterpret_cast<const uint16_t&>(h2);
#endif
return tmp.u32;
}
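// Both branches could collapse into one packed-convert intrinsic; a sketch,
// assuming __floats2half2_rn is provided by the fp16 header
// (float2_to_half2_alt is a hypothetical name):
inline __device__ uint32_t float2_to_half2_alt(float2 f)
{
    const __half2 h = __floats2half2_rn(f.x, f.y);  // f.x -> low half, f.y -> high half
    uint32_t u;
    __builtin_memcpy(&u, &h, sizeof(u));
    return u;
}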
......@@ -221,7 +238,8 @@ inline __device__ uint32_t float2_to_half2(float2 f)
inline __device__ float half_to_float(uint16_t h)
{
float f;
asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
// asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
f = __half2float(reinterpret_cast<const __half&>(h));
return f;
}
......@@ -230,7 +248,9 @@ inline __device__ float half_to_float(uint16_t h)
inline __device__ float2 half2_to_float2(uint32_t v)
{
uint16_t lo, hi;
asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v));
// asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v));
lo = static_cast<uint16_t>(v & 0xffffu);
hi = static_cast<uint16_t>((v >> 16) & 0xffffu);
return make_float2(half_to_float(lo), half_to_float(hi));
}
......@@ -276,7 +296,11 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb)
inline __device__ uint32_t h0_h0(uint16_t a)
{
uint32_t b;
asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a));
// asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a));
uint16_t tmp[2];
tmp[0] = a;
tmp[1] = a;
__builtin_memcpy(&b, tmp, sizeof(uint16_t) * 2);
return b;
}
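// Equivalent sketch via the broadcast intrinsic, assuming __half2half2 exists
// in the target's fp16 header (h0_h0_alt is a hypothetical name):
inline __device__ uint32_t h0_h0_alt(uint16_t a)
{
    __half h;
    __builtin_memcpy(&h, &a, sizeof(h));
    const __half2 hh = __half2half2(h);  // replicate the half into both lanes
    uint32_t b;
    __builtin_memcpy(&b, &hh, sizeof(b));
    return b;
}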
......@@ -370,7 +394,8 @@ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c)
inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c)
{
uint32_t d;
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
// asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c));
return d;
}
......@@ -581,7 +606,8 @@ template<>
inline __device__ uint16_t mul(uint16_t a, uint16_t b)
{
uint16_t c;
asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
// asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
asm volatile("v_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
return c;
}
......@@ -591,7 +617,8 @@ template<>
inline __device__ uint32_t mul(uint32_t a, uint32_t b)
{
uint32_t c;
asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
// asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
asm volatile("v_pk_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
return c;
}
......
......@@ -20,7 +20,8 @@
#elif (CUDART_VERSION >= 11000)
#include <cub/cub.cuh>
#else
#include "3rdparty/cub/cub.cuh"
// #include "3rdparty/cub/cub.cuh"
#include <cub/cub.cuh>
#endif
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/utils/memory_utils.h"
......
......@@ -23,7 +23,8 @@
#elif (CUDART_VERSION >= 11000)
#include <cub/cub.cuh>
#else
#include "3rdparty/cub/cub.cuh"
// #include "3rdparty/cub/cub.cuh"
#include <cub/cub.cuh>
#endif
#include "src/turbomind/kernels/logprob_kernels.h"
......
......@@ -21,7 +21,8 @@
#elif (CUDART_VERSION >= 11000)
#include <cub/cub.cuh>
#else
#include "3rdparty/cub/cub.cuh"
// #include "3rdparty/cub/cub.cuh"
#include <cub/cub.cuh>
#endif
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
......
......@@ -19,7 +19,8 @@
#elif (CUDART_VERSION >= 11000)
#include <cub/cub.cuh>
#else
#include "3rdparty/cub/cub.cuh"
// #include "3rdparty/cub/cub.cuh"
#include <cub/cub.cuh>
#endif
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
......
......@@ -145,7 +145,8 @@ void invokeLengthCriterion(bool* finished,
// Check if we have attained the sequence length limit. If so, stop the sequence.
// In addition, check if all sequences are stopped and return the result in should_stop
TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
dim3 block{min(512, uint32_t(batch_size * beam_width))};
// dim3 block{min(512, uint32_t(batch_size * beam_width))};
dim3 block{static_cast<unsigned int>(min(512, uint32_t(batch_size * beam_width)))};
dim3 grid{1};
h_pinned_finished_sum_[0] = -1;
......
......@@ -178,7 +178,11 @@ __global__ void softmax_kernel_h2(T* attn_score,
qk_bias = hadd2<T2>(qk_bias, hmul2<T2>(hsub2<T2>(ONE, mask_val), NEG_INFTY));
data[i] = hadd2<T2>(hmul2<T2>(qk, qk_scale_h2), qk_bias);
local_max = fmax(local_max, fmax((float)data[i].x, (float)data[i].y));
// if (std::is_same<T2, half2>::value) {
local_max = fmax(local_max, fmax((float)data[i].data[0], (float)data[i].data[1]));
// } else {
// local_max = fmax(local_max, fmax((float)data[i].x, (float)data[i].y));
// }
}
float max_val = blockDim.x <= 32 ? warpReduceMax(local_max) : blockReduceMax<float>(local_max);
......@@ -190,7 +194,11 @@ __global__ void softmax_kernel_h2(T* attn_score,
float local_sum = 0.0f;
for (int i = 0; blockDim.x * i + threadIdx.x < (k_length / 2) && i < ITEMS_PER_THREAD; i++) {
data[i] = hexp2<T2>(hsub2<T2>(data[i], cuda_cast<T2>(s_max)));
local_sum += (float)(data[i].x + data[i].y);
// if (std::is_same<T2, half2>::value) {
local_sum += (float)(data[i].data[0] + data[i].data[1]);
// } else {
// local_sum += (float)(data[i].x + data[i].y);
// }
}
float sum_val = blockDim.x <= 32 ? warpReduceSum(local_sum) : blockReduceSum<float>(local_sum);
......@@ -310,7 +318,11 @@ __global__ void softmax_kernel_h2_v2(T* attn_score,
val = hadd2<T2>(val, pos_bias[j]);
}
data[j][i] = val;
local_max[j] = fmax(local_max[j], fmax((float)data[j][i].x, (float)data[j][i].y));
// if (std::is_same<T2, half2>::value) {
local_max[j] = fmax(local_max[j], fmax((float)data[j][i].data[0], (float)data[j][i].data[1]));
// } else {
// local_max[j] = fmax(local_max[j], fmax((float)data[j][i].x, (float)data[j][i].y));
// }
}
}
......@@ -343,7 +355,11 @@ __global__ void softmax_kernel_h2_v2(T* attn_score,
#pragma unroll
for (int j = 0; j < Q_ITEMS; j++) {
local_sum[j] += (float)(data[j][i].x + data[j][i].y);
// if (std::is_same<T2, half2>::value) {
local_sum[j] += (float)(data[j][i].data[0] + data[j][i].data[1]);
// } else {
// local_sum[j] += (float)(data[j][i].x + data[j][i].y);
// }
}
}
......@@ -1878,6 +1894,7 @@ void invokeMaskedSoftMaxWithRelPosBias(T* qk_buf,
qk_scale);
}
else if (std::is_same<T, half>::value) {
printf("============xiabo_test %s:%d\n", __FILE__,__LINE__);
softmax_withRelPosBias_element2_kernel<half2, half>
<<<grid, block, 0, stream>>>((half2*)qk_buf,
(const half2*)attn_mask,
......
......@@ -13,12 +13,14 @@
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
add_subdirectory(sampling_layers)
find_package(CUDAToolkit REQUIRED)
#find_package(CUDAToolkit REQUIRED)
find_package(CUDA REQUIRED)
add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc)
set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(DynamicDecodeLayer PUBLIC CUDA::cudart TopKSamplingLayer
#set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(DynamicDecodeLayer PUBLIC cudart TopKSamplingLayer
TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils)
......@@ -14,19 +14,23 @@
cmake_minimum_required(VERSION 3.8)
find_package(CUDAToolkit REQUIRED)
#find_package(CUDAToolkit REQUIRED)
find_package(CUDA REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
add_library(BaseSamplingLayer STATIC BaseSamplingLayer.cc)
set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseSamplingLayer PUBLIC CUDA::cudart sampling_penalty_kernels memory_utils)
#set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseSamplingLayer PUBLIC cudart sampling_penalty_kernels memory_utils)
add_library(TopKSamplingLayer STATIC TopKSamplingLayer.cu)
set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopKSamplingLayer PUBLIC CUDA::cudart BaseSamplingLayer sampling_topk_kernels)
#set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopKSamplingLayer PUBLIC cudart BaseSamplingLayer sampling_topk_kernels)
add_library(TopPSamplingLayer STATIC TopPSamplingLayer.cu)
set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopPSamplingLayer PUBLIC CUDA::cudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)
#set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopPSamplingLayer PUBLIC cudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)
......@@ -2,9 +2,10 @@
cmake_minimum_required(VERSION 3.8)
add_subdirectory(fused_multi_head_attention)
#add_subdirectory(fused_multi_head_attention)
find_package(CUDAToolkit REQUIRED)
#find_package(CUDAToolkit REQUIRED)
find_package(CUDA REQUIRED)
add_library(Llama STATIC
LlamaV2.cc
......@@ -20,10 +21,12 @@ add_library(Llama STATIC
llama_kernels.cu
llama_decoder_kernels.cu
llama_utils.cu)
set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(Llama PUBLIC CUDA::cudart
gemm_s4_f16
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
#set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(Llama PUBLIC cudart
# gemm_s4_f16
cublasMMWrapper
DynamicDecodeLayer
activation_kernels
......@@ -38,8 +41,8 @@ target_link_libraries(Llama PUBLIC CUDA::cudart
memory_utils
nccl_utils
cuda_utils
logger
llama_fmha)
logger)
# llama_fmha)
if (NOT MSVC)
add_subdirectory(flash_attention2)
......@@ -47,5 +50,5 @@ if (NOT MSVC)
endif()
add_executable(llama_gemm llama_gemm.cc)
target_link_libraries(llama_gemm PUBLIC CUDA::cudart gpt_gemm_func memory_utils cuda_utils logger)
target_link_libraries(llama_gemm PUBLIC cudart gpt_gemm_func memory_utils cuda_utils logger)
install(TARGETS llama_gemm DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/bin)
......@@ -52,7 +52,8 @@ void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
k_buf_2_ = q_buf_2_ + local_head_num_ * batch_size * max_q_len * size_per_head_;
v_buf_2_ = k_buf_2_ + local_kv_head_num_ * batch_size * max_q_len * size_per_head_;
if (use_fmha_) {
// if (use_fmha_) {
if (0) {
FlashAttentionOp<T> flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
if (flash_attention.get_workspace_size() > 0) {
qk_buf_float_ = (float*)allocator_->reMalloc(qk_buf_float_, flash_attention.get_workspace_size(), true);
......@@ -86,7 +87,8 @@ void LlamaContextAttentionLayer<T>::freeBuffer()
allocator_->free((void**)(&qkv_buf_));
allocator_->free((void**)(&q_buf_2_));
if (use_fmha_) {
// if (use_fmha_) {
if (0) {
allocator_->free((void**)&qk_buf_float_);
}
else {
......@@ -209,7 +211,8 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
weights->past_kv_scale.data());
sync_check_cuda_error();
if (use_fmha_) {
// if (use_fmha_) {
if (0) {
fusedMultiHeadAttention(k_cache_ptrs,
v_cache_ptrs,
layer_offset,
......@@ -252,7 +255,7 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
}
sync_check_cuda_error();
}
#if 0
template<typename T>
void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
......@@ -311,7 +314,7 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptr
//
flash_attention(attn_params, stream_);
}
#endif
template<typename T>
void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
......
......@@ -22,10 +22,18 @@
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h"
#include <filesystem>
// #include <filesystem>
#include <experimental/filesystem>
#include <sys/stat.h>
#include <string>
namespace turbomind {
bool fileExists(const std::string& path) {
struct stat buffer;
return (stat(path.c_str(), &buffer) == 0);
}
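// stat() serves as a C++14-compatible stand-in for std::filesystem::exists,
// since <filesystem> may be unavailable on the ROCm toolchain; the
// <experimental/filesystem> include above is only needed if other code here
// still uses it.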
template<typename T>
LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t head_num,
size_t kv_head_num,
......@@ -129,7 +137,8 @@ void loadWeights(LlamaDenseWeight<T>& w,
}
else {
// Disable slice if weight has already been sliced
if (std::filesystem::exists(max_prefix + ".weight") || std::filesystem::exists(max_prefix + ".qweight")) {
// if (std::filesystem::exists(max_prefix + ".weight") || std::filesystem::exists(max_prefix + ".qweight")) {
if (fileExists(max_prefix + ".weight") || fileExists(max_prefix + ".qweight")) {
TM_LOG_DEBUG("TP weight exists. Disable runtime TP.");
enable_slice = false;
}
......
......@@ -2,7 +2,7 @@
#pragma once
#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
// #include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
......@@ -61,29 +61,29 @@ private:
void forwardInt4(T* output_data, const T* input_data, int batch_size, const LlamaDenseWeight<T>& weight, Type type)
{
if constexpr (std::is_same_v<T, half>) {
gemm_s4_f16_.Run(output_data,
(const uint*)weight.kernel,
input_data,
(const half2*)weight.scales_and_zeros,
weight.output_dims,
batch_size,
weight.input_dims,
weight.group_size,
type == kFusedSiluFfn ? GemmS4F16::kFusedSiluFfn : GemmS4F16::kGemm,
-1,
stream_);
sync_check_cuda_error();
}
else {
// if constexpr (std::is_same_v<T, half>) {
// gemm_s4_f16_.Run(output_data,
// (const uint*)weight.kernel,
// input_data,
// (const half2*)weight.scales_and_zeros,
// weight.output_dims,
// batch_size,
// weight.input_dims,
// weight.group_size,
// type == kFusedSiluFfn ? GemmS4F16::kFusedSiluFfn : GemmS4F16::kGemm,
// -1,
// stream_);
// sync_check_cuda_error();
// }
// else {
FT_CHECK_WITH_INFO(0, "Not implemented");
}
// }
}
private:
cublasMMWrapper* cublas_wrapper_;
cudaStream_t stream_{};
GemmS4F16 gemm_s4_f16_;
// GemmS4F16 gemm_s4_f16_;
};
} // namespace turbomind