#include #include #include #include #include #include #include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" #include "host_tensor_generator.hpp" #include "device_tensor.hpp" #include "check_err.hpp" #include "transpose_vectors.hpp" #include "common_header.hpp" using SrcDataType = ck::half_t; using DstDataType = ck::bhalf_t; __global__ void gpu_convert_data(SrcDataType* in, DstDataType* out, int size) { using namespace ck; ck::index_t num = blockIdx.x * blockDim.x + threadIdx.x * 2; const auto src_buf = ck::make_dynamic_buffer(in, size); auto dst_buf = ck::make_dynamic_buffer(out, size); auto src_data = src_buf.template Get(num, true); ck::bhalf2_t dst_data; convert_half2_to_bhalf2(src_data, dst_data); dst_buf.template Set(num, true, dst_data); } __global__ void gpu_transpose_convert_data(SrcDataType* in, DstDataType* out, const int size, const int stride) { using namespace ck; ck::index_t num = blockIdx.x * blockDim.x + threadIdx.x * 2; const auto src_buf = ck::make_dynamic_buffer(in, size); auto dst_buf = ck::make_dynamic_buffer(out, size); int x = num % stride; int y = num / stride; int num1 = (y + 1) * stride + x; auto src_data0 = src_buf.template Get(num, true); auto src_data1 = src_buf.template Get(num1, true); ck::bhalf2_t dst_data0, dst_data1; transpose_half_to_bhalf_2x2(src_data0, src_data1, dst_data0, dst_data1); // rewrite static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; const vector_type vx0{dst_data0}, vx1{dst_data1}; vector_type vy0, vy1; vy0.template AsType()(I0) = vx0.template AsType()[I0]; vy0.template AsType()(I1) = vx1.template AsType()[I0]; vy1.template AsType()(I0) = vx0.template AsType()[I1]; vy1.template AsType()(I1) = vx1.template AsType()[I1]; dst_buf.template Set(num, true, vy0.template AsType()[I0]); dst_buf.template Set(num1, true, vy1.template AsType()[I0]); } void host_convert_data(SrcDataType* in, DstDataType* out, size_t len) { for(int i = 0; i < len; i++) { out[i] = ck::type_convert(in[i]); } } int main(int, char*[]) { bool pass = true; constexpr int N = 4; constexpr int K = 4; constexpr int size = N * K; constexpr int thread_num = size / 2; // create tensor Tensor src_n_k_host( HostTensorDescriptor(std::vector({N, K}), std::vector({K, 1}))); Tensor dst_n_k_host_result( HostTensorDescriptor(std::vector({N, K}), std::vector({K, 1}))); Tensor dst_n_k_device_result( HostTensorDescriptor(std::vector({N, K}), std::vector({K, 1}))); // init data src_n_k_host.GenerateTensorValue(GeneratorTensor_3{-5, 5}); dst_n_k_host_result.GenerateTensorValue(GeneratorTensor_1{0}); dst_n_k_device_result.GenerateTensorValue(GeneratorTensor_1{0}); // alloc gpu memory DeviceMem in_dev_buf(sizeof(SrcDataType) * src_n_k_host.mDesc.GetElementSpace()); DeviceMem out_dev_buf(sizeof(DstDataType) * dst_n_k_host_result.mDesc.GetElementSpace()); // init gpu memory in_dev_buf.ToDevice(src_n_k_host.mData.data()); out_dev_buf.SetZero(); // run cpu data convert host_convert_data(src_n_k_host.mData.data(), dst_n_k_host_result.mData.data(), size); // run kernel to convert data gpu_convert_data<<<1, thread_num>>>(static_cast(in_dev_buf.GetDeviceBuffer()), static_cast(out_dev_buf.GetDeviceBuffer()), src_n_k_host.mDesc.GetElementSpace()); // read from gpu out_dev_buf.FromDevice(dst_n_k_device_result.mData.data()); pass = ck::utils::check_err(dst_n_k_device_result.mData, dst_n_k_host_result.mData); // run kernel to tanspos and convert data gpu_transpose_convert_data<<<1, thread_num / 2>>>( static_cast(in_dev_buf.GetDeviceBuffer()), static_cast(out_dev_buf.GetDeviceBuffer()), src_n_k_host.mDesc.GetElementSpace(), K); // read from gpu out_dev_buf.FromDevice(dst_n_k_device_result.mData.data()); pass &= ck::utils::check_err(dst_n_k_device_result.mData, dst_n_k_host_result.mData); #if 1 LogRangeAsType(std::cout << "in : ", src_n_k_host.mData, ",") << std::endl; LogRangeAsType(std::cout << "out device: ", dst_n_k_device_result.mData, ",") << std::endl; LogRangeAsType(std::cout << "out host: ", dst_n_k_host_result.mData, ",") << std::endl; #endif if(pass) { std::cout << "fp16 transfer to bf16: Pass" << std::endl; return 0; } else { std::cout << "fp16 transfer to bf16: Fail" << std::endl; return -1; } }