#include <algorithm>
#include <cstdint>
#include <iostream>
#include <memory>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>

#include <hip/hip_runtime.h>
#include <rocrand/rocrand.h>

#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/reduction_operator.hpp"
#include "include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"

size_t GLOBAL_WORKSPACE_SIZE = 0;

using layernorm_rank2_256_1_256_1_8_1_8_1_8_1_8_8 =
    ck::tensor_operation::device::DeviceLayernormImpl<
        ck::half_t,  // x data type
        ck::half_t,  // gamma data type
        ck::half_t,  // beta data type
        float,       // accumulator data type
        ck::half_t,  // y data type
        ck::tensor_operation::element_wise::PassThrough,
        2,    // rank
        1,    // number of reduced (normalized) dims
        256,  // block_size
        1,    // m_cluster_size
        256,  // k_cluster_size
        1,    // m_slice_size
        8,    // k_slice_size
        1,    // in_src_dim
        8,    // in_src_size
        1,    // gamma_src_dim
        8,    // gamma_src_size
        1,    // beta_src_dim
        8,    // beta_src_size
        8>;   // out_dst_size

using DeviceInstance = layernorm_rank2_256_1_256_1_8_1_8_1_8_1_8_8;

void layernorm_0(void* input,
                 void* gamma,
                 void* beta,
                 void* output,
                 int64_t* in_0,
                 int64_t* in_1,
                 hipStream_t stream) {
    int M = *in_0;
    int N = *in_1;

    std::vector<ck::index_t> i_inStrides;
    i_inStrides.push_back(N);
    i_inStrides.push_back(1);

    auto device_instance = DeviceInstance{};
    auto argument_ptr = device_instance.MakeArgumentPointer(
        {M, N},                          // lengths
        i_inStrides,                     // x strides (row-major [M, N])
        std::vector<ck::index_t>{0, 1},  // gamma strides (broadcast over M)
        std::vector<ck::index_t>{0, 1},  // beta strides (broadcast over M)
        i_inStrides,                     // y strides
        {1},                             // reduced (normalized) dims
        1e-05,                           // epsilon
        static_cast<ck::half_t*>(input),
        static_cast<ck::half_t*>(gamma),
        static_cast<ck::half_t*>(beta),
        static_cast<ck::half_t*>(output),
        ck::tensor_operation::element_wise::PassThrough{});

    if (!device_instance.IsSupportedArgument(argument_ptr.get())) {
        throw std::runtime_error(
            "wrong! device_layernorm with the specified compilation parameters "
            "does not support this LayerNorm problem");
    }

    std::string instance_name = device_instance.GetTypeString();
    auto invoker_ptr = device_instance.MakeInvokerPointer();
    invoker_ptr->Run(argument_ptr.get(), StreamConfig{stream, false});
    return;
}

struct ProfilerMemoryPool {
    ProfilerMemoryPool() {
        std::random_device rd;
        gen = std::mt19937(rd());
        uniform_dist = std::uniform_int_distribution<uint64_t>(1, 48964896);
        // The rocRAND generator must be created before it can be seeded/used.
        rocrand_create_generator(&generator, ROCRAND_RNG_PSEUDO_DEFAULT);
        offsets.reserve(512);
        strides.reserve(512);
        copies.reserve(512);
        ptrs.reserve(512);
    }

    ~ProfilerMemoryPool() {
        for (size_t i = 0; i < ptrs.size(); i++) {
            hipFree(ptrs[i]);
        }
        rocrand_destroy_generator(generator);
    }

    template <typename DType>
    DType* AllocateGaussianTensor(int64_t size) {
        size_t length = size * sizeof(DType);
        DType* d_x;
        hipMalloc(&d_x, length);

        float mean = 0.0f;
        float stddev = 1.0f;
        uint64_t seed = uniform_dist(gen);
        rocrand_set_seed(generator, seed);
        rocrand_generate_normal(
            generator, reinterpret_cast<float*>(d_x), size, mean, stddev);
        return d_x;
    }

    ck::half_t* AllocateHalfGaussianTensor(int64_t size) {
        // Allocate as float so the generated data fits the buffer, then
        // reinterpret; the profiler only needs well-defined fp16 inputs,
        // not numerically meaningful ones.
        return reinterpret_cast<ck::half_t*>(
            AllocateGaussianTensor<float>(size));
    }

    int AllocateHalfTensor(int64_t size, int64_t copy) {
        offsets.push_back(0);
        strides.push_back(size);
        copies.push_back(copy);
        auto ptr = AllocateHalfGaussianTensor(size * copy);
        ptrs.push_back(reinterpret_cast<void*>(ptr));
        return ptrs.size() - 1;
    }

    ck::half_t* RequestHalfTensorByIdx(int idx) {
        auto copy = copies.at(idx);
        auto offset = offsets.at(idx);
        auto stride = strides.at(idx);
        ck::half_t* ptr = reinterpret_cast<ck::half_t*>(ptrs.at(idx));
        ptr += offset;

        // Advance to the next replica; wrap around after the last copy.
        offset += stride;
        if (offset == copy * stride) {
            offset = 0;
        }
        offsets[idx] = offset;

        return ptr;
    }

    std::vector<int64_t> offsets;
    std::vector<int64_t> strides;
    std::vector<int64_t> copies;
    std::vector<void*> ptrs;
    std::mt19937 gen;
    std::uniform_int_distribution<uint64_t> uniform_dist;
    rocrand_generator generator;
};
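// Note on the pool above: each logical tensor is allocated `copy` times in one
// contiguous buffer, and RequestHalfTensorByIdx() returns a different replica
// on every call, advancing by `stride` elements and wrapping after the last
// copy. Rotating through replicas (with the pool sized against an ~8 MiB L2,
// see the TODO in main) is presumably meant to keep back-to-back profiling
// iterations from re-reading data that is still resident in cache, which would
// make the timed kernels look faster than they are.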
// hack for DeviceMem linking error
// TODO: fix this by making CK a header-only lib
// <<< hack begin
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) {
    hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}

void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }

void DeviceMem::ToDevice(const void* p) const {
    hipGetErrorString(hipMemcpy(
        mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}

void DeviceMem::FromDevice(void* p) const {
    hipGetErrorString(
        hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}

DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }

struct KernelTimerImpl {
    KernelTimerImpl() {
        hipGetErrorString(hipEventCreate(&mStart));
        hipGetErrorString(hipEventCreate(&mEnd));
    }

    ~KernelTimerImpl() {
        hipGetErrorString(hipEventDestroy(mStart));
        hipGetErrorString(hipEventDestroy(mEnd));
    }

    void Start() {
        hipGetErrorString(hipDeviceSynchronize());
        hipGetErrorString(hipEventRecord(mStart, nullptr));
    }

    void End() {
        hipGetErrorString(hipEventRecord(mEnd, nullptr));
        hipGetErrorString(hipEventSynchronize(mEnd));
    }

    float GetElapsedTime() const {
        float time;
        hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
        return time;
    }

    hipEvent_t mStart, mEnd;
};
// >>> hack end

int main(int argc, char** argv) {
    const int64_t in_0 = std::stoi(argv[1]);
    const int64_t in_1 = std::stoi(argv[2]);

    auto memory_pool = std::make_unique<ProfilerMemoryPool>();
    hipStream_t stream = nullptr;

    int64_t ptr_sz = in_0 * in_1;
    int64_t norm_dim = in_1;

    // TODO: special pool size for 8M L2 cache
    // need to tune it for other devices
    int64_t mem_pool_sz = std::max(2, std::min(64, int((1 << 23) / ptr_sz)));

    memory_pool->AllocateHalfTensor(ptr_sz, mem_pool_sz);    // in: index 0
    memory_pool->AllocateHalfTensor(ptr_sz, mem_pool_sz);    // out: index 1
    memory_pool->AllocateHalfTensor(norm_dim, mem_pool_sz);  // gamma: index 2
    memory_pool->AllocateHalfTensor(norm_dim, mem_pool_sz);  // beta: index 3

    // warmup
    for (int i = 0; i < 3; ++i) {
        layernorm_0((void*)memory_pool->RequestHalfTensorByIdx(0),
                    (void*)memory_pool->RequestHalfTensorByIdx(2),
                    (void*)memory_pool->RequestHalfTensorByIdx(3),
                    (void*)memory_pool->RequestHalfTensorByIdx(1),
                    const_cast<int64_t*>(&in_0),
                    const_cast<int64_t*>(&in_1),
                    stream);
    }

    // run
    KernelTimerImpl timer;
    timer.Start();
    for (int i = 0; i < 5; ++i) {
        layernorm_0((void*)memory_pool->RequestHalfTensorByIdx(0),
                    (void*)memory_pool->RequestHalfTensorByIdx(2),
                    (void*)memory_pool->RequestHalfTensorByIdx(3),
                    (void*)memory_pool->RequestHalfTensorByIdx(1),
                    const_cast<int64_t*>(&in_0),
                    const_cast<int64_t*>(&in_1),
                    stream);
    }
    timer.End();

    std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
    // The original output was truncated after "WS:"; a time report is assumed
    // from the timer usage above, and the exact label may differ.
    std::cout << "TIME:" << timer.GetElapsedTime() << std::endl;

    return 0;
}
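// ---------------------------------------------------------------------------
// Usage sketch: the profiler reads the two runtime dimensions M and N from the
// command line, e.g.
//
//   ./layernorm_profiler 1024 768
//
// (the binary name is an assumption; use whatever the surrounding build
// produces from this file). This times an fp16 [M, N] layernorm normalized
// over the last dimension and prints the workspace size ("WS:") followed by
// the measured kernel time.
// ---------------------------------------------------------------------------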