Unverified commit 6bc9ee05 authored by Chao Liu, committed by GitHub

Remove program server (#10)

* removing program server

* specify launch bounds per kernel instance
parent f3baea0d
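In short: kernel functors lose their ProgramServer parameter and become plain __device__ callables, and launches go through launch_kernel, which also lets each kernel instantiation pick its own launch bounds. A minimal before/after sketch assembled from the diffs below (argument lists abbreviated):

// Before: functor takes a ProgramServer and is launched via launch()
// __host__ __device__ void operator()(ProgramServer& ps, const ADataType* p_a, ...);
float ave_time = launch(ProgramServer{},
                        gemm_kernel,
                        kGridSize,
                        kBlockSize,
                        /* kernel arguments... */);

// After: plain __device__ operator(), launched via launch_kernel with an explicit
// StreamConfig, per-kernel launch bounds, and a dynamic-LDS byte count
// __device__ void operator()(const ADataType* p_a, ...);
float ave_time = launch_kernel<kBlockSize, 2>(StreamConfig{nullptr, true},
                                              gemm_kernel,
                                              kGridSize,
                                              kBlockSize,
                                              0, // dynamic LDS bytes
                                              /* kernel arguments... */);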
add_example_executable(example_hello_world hello_world.cpp)
add_example_executable(example_im2col im2col.cpp)
add_example_executable(example_gemm gemm.cpp)
add_example_executable(example_gemm_gemm gemm_gemm.cpp)
......
......@@ -4,9 +4,8 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -115,10 +114,11 @@ int main(int argc, char* argv[])
kGemmNPerBlock,
kGemmKPerBlock>{};
float ave_time = launch(ProgramServer{},
float ave_time = launch_kernel<kBlockSize, 2>(StreamConfig{nullptr, true},
gemm_kernel,
kGridSize,
kBlockSize,
0,
static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_buf.GetDeviceBuffer()),
......
......@@ -7,8 +7,8 @@
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_elementwise.hpp"
#include "ck/tile_program/tile/tile_gemm_shape.hpp"
......@@ -59,8 +59,7 @@ struct Gemm
using GridGemm = ck::tile_program::grid::GridGemm<Problem, Policy>;
__host__ __device__ void operator()(ProgramServer& ps,
const ADataType* p_a,
__device__ void operator()(const ADataType* p_a,
const BDataType* p_b,
CDataType* p_c,
ck::index_t M,
......@@ -87,12 +86,7 @@ struct Gemm
auto c_dram_grid = make_naive_tensor_view<AddressSpaceEnum::Global>(
p_c, make_tuple(M, N), make_tuple(Ldc, 1), Number<32>{}, Number<1>{});
GridGemm{}(ps,
a_dram_grid,
b_dram_grid,
c_dram_grid,
a_element_func,
b_element_func,
c_element_func);
GridGemm{}(
a_dram_grid, b_dram_grid, c_dram_grid, a_element_func, b_element_func, c_element_func);
}
};
......@@ -4,9 +4,8 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -90,7 +89,8 @@ int main(int argc, char* argv[])
std::cout << "grid size " << kGridSize << std::endl;
float ave_time = launch(ProgramServer{},
float ave_time =
launch_kernel<kBlockSize, 2>(StreamConfig{nullptr, true},
GemmGemm<A0DataType,
B0DataType,
Acc0DataType,
......@@ -105,6 +105,7 @@ int main(int argc, char* argv[])
kN1PerBlock>{},
kGridSize,
kBlockSize,
0,
static_cast<A0DataType*>(a0_buf.GetDeviceBuffer()),
static_cast<B0DataType*>(b0_buf.GetDeviceBuffer()),
static_cast<B1DataType*>(b1_buf.GetDeviceBuffer()),
......
......@@ -8,7 +8,6 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_elementwise.hpp"
#include "ck/tile_program/tile/tile_gemm_shape.hpp"
......@@ -54,7 +53,7 @@ struct GemmGemm
#if 0
// 2d
__host__ __device__ static constexpr auto MakeB1LdsBlockDescriptor()
__device__ static constexpr auto MakeB1LdsBlockDescriptor()
{
using namespace ck;
......@@ -68,7 +67,7 @@ struct GemmGemm
}
#else
// fake XOR
__host__ __device__ static constexpr auto MakeB1LdsBlockDescriptor()
__device__ static constexpr auto MakeB1LdsBlockDescriptor()
{
using namespace ck;
......@@ -100,7 +99,7 @@ struct GemmGemm
}
#endif
__host__ __device__ static constexpr auto MakeB1DramTileDistribution()
__device__ static constexpr auto MakeB1DramTileDistribution()
{
using namespace ck;
using namespace ck::tile_program;
......@@ -125,7 +124,7 @@ struct GemmGemm
Sequence<0, 1>>{});
}
__host__ __device__ static constexpr ck::index_t GetStaticLdsSize()
__device__ static constexpr ck::index_t GetStaticLdsSize()
{
using namespace ck;
......@@ -134,8 +133,7 @@ struct GemmGemm
sizeof(B1DataType)));
}
__host__ __device__ void operator()(ProgramServer& ps,
const A0DataType* p_a0,
__device__ void operator()(const A0DataType* p_a0,
const B0DataType* p_b0,
const B1DataType* p_b1,
C1DataType* p_c1,
......@@ -163,17 +161,17 @@ struct GemmGemm
p_b1, make_tuple(N1, N0), make_tuple(Ldb1, 1), Number<32>{}, Number<1>{});
// divide problem
const auto id_block = ps.get_block_id();
const auto id_block = get_block_id();
const auto num_tile_m0 = M0 / kM0PerBlock;
const auto num_tile_n1 = N1 / kN1PerBlock;
const auto block2tile = ps(make_cluster_descriptor(make_tuple(num_tile_m0, num_tile_n1)));
const auto block2tile = make_cluster_descriptor(make_tuple(num_tile_m0, num_tile_n1));
const auto id_tile = block2tile.CalculateBottomIndex(make_tuple(id_block));
const auto iM0 = ps.read_first_lane(id_tile.At<0>() * kM0PerBlock);
const auto iN1 = ps.read_first_lane(id_tile.At<1>() * kN1PerBlock);
const auto iM0 = __builtin_amdgcn_readfirstlane(id_tile.At<0>() * kM0PerBlock);
const auto iN1 = __builtin_amdgcn_readfirstlane(id_tile.At<1>() * kN1PerBlock);
__shared__ char p_smem_char[GetStaticLdsSize()];
......@@ -233,18 +231,18 @@ struct GemmGemm
const auto b1_block_tile = load_tile(b1_dram_block_window);
// wait for block gemm0 pipeline to finish
ps.block_sync_lds();
block_sync_lds();
store_tile(b1_lds_block_window, b1_block_tile);
// wait for store_tile to finish
ps.block_sync_lds();
block_sync_lds();
// acc1 += c0 * b1
block_gemm1(acc1_block_tile, c0_block_tile, b1_lds_block_window);
// wait for block gemm1 to finish
ps.block_sync_lds();
block_sync_lds();
}
// move tile windows
......
......@@ -4,9 +4,8 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -73,14 +72,10 @@ int main(int argc, char* argv[])
ck::utils::FillUniformDistributionIntegerValue<A0DataType>{-3.f, 3.f}(a0_host);
ck::utils::FillUniformDistributionIntegerValue<B0DataType>{-3.f, 3.f}(b0_host);
ck::utils::FillUniformDistributionIntegerValue<B1DataType>{-3.f, 3.f}(b1_host);
#elif 0
#else
ck::utils::FillUniformDistribution<A0DataType>{-3.f, 3.f}(a0_host);
ck::utils::FillUniformDistribution<B0DataType>{-3.f, 3.f}(b0_host);
ck::utils::FillUniformDistribution<B1DataType>{-3.f, 3.f}(b1_host);
#else
ck::utils::FillConstant<A0DataType>{1.0f}(a0_host);
ck::utils::FillConstant<A0DataType>{1.0f}(b0_host);
ck::utils::FillConstant<A0DataType>{1.0f}(b1_host);
#endif
// reference
......@@ -107,7 +102,8 @@ int main(int argc, char* argv[])
std::cout << "grid size " << kGridSize << std::endl;
float ave_time = launch(ProgramServer{},
float ave_time =
launch_kernel<kBlockSize, 2>(StreamConfig{nullptr, true},
GemmSoftmaxGemm<A0DataType,
B0DataType,
Acc0DataType,
......@@ -122,6 +118,7 @@ int main(int argc, char* argv[])
kN1PerBlock>{},
kGridSize,
kBlockSize,
0,
static_cast<A0DataType*>(a0_buf.GetDeviceBuffer()),
static_cast<B0DataType*>(b0_buf.GetDeviceBuffer()),
static_cast<B1DataType*>(b1_buf.GetDeviceBuffer()),
......
......@@ -8,7 +8,6 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_elementwise.hpp"
#include "ck/tile_program/tile/tile_gemm_shape.hpp"
......@@ -19,8 +18,7 @@
#include "ck/tile_program/block_tile/block_reduce.hpp"
// C0 = A0 * B0
// D0 = softmax(C0)
// C1 = D0 * B1
// C1 = softmax(C0) * B1
template <typename A0DataType,
typename B0DataType,
typename Acc0DataType,
......@@ -57,7 +55,7 @@ struct GemmSoftmaxGemm
#if 0
// 2d
__host__ __device__ static constexpr auto MakeB1LdsBlockDescriptor()
__device__ static constexpr auto MakeB1LdsBlockDescriptor()
{
using namespace ck;
......@@ -71,7 +69,7 @@ struct GemmSoftmaxGemm
}
#else
// fake XOR
__host__ __device__ static constexpr auto MakeB1LdsBlockDescriptor()
__device__ static constexpr auto MakeB1LdsBlockDescriptor()
{
using namespace ck;
......@@ -103,7 +101,7 @@ struct GemmSoftmaxGemm
}
#endif
__host__ __device__ static constexpr auto MakeB1DramTileDistribution()
__device__ static constexpr auto MakeB1DramTileDistribution()
{
using namespace ck;
using namespace ck::tile_program;
......@@ -128,7 +126,7 @@ struct GemmSoftmaxGemm
Sequence<0, 1>>{});
}
__host__ __device__ static constexpr ck::index_t GetStaticLdsSize()
__device__ static constexpr ck::index_t GetStaticLdsSize()
{
using namespace ck;
......@@ -137,8 +135,7 @@ struct GemmSoftmaxGemm
sizeof(B1DataType)));
}
__host__ __device__ void operator()(ProgramServer& ps,
const A0DataType* p_a0,
__device__ void operator()(const A0DataType* p_a0,
const B0DataType* p_b0,
const B1DataType* p_b1,
C1DataType* p_c1,
......@@ -169,17 +166,15 @@ struct GemmSoftmaxGemm
p_b1, make_tuple(N1, N0), make_tuple(Ldb1, 1), Number<32>{}, Number<1>{});
// divide problem
const auto id_block = ps.get_block_id();
const auto num_tile_m0 = M0 / kM0PerBlock;
const auto num_tile_n1 = N1 / kN1PerBlock;
const auto block2tile = ps(make_cluster_descriptor(make_tuple(num_tile_m0, num_tile_n1)));
const auto id_block = get_block_id();
const auto id_tile = block2tile.CalculateBottomIndex(make_tuple(id_block));
const auto id_tile_m = id_block / num_tile_n1;
const auto id_tile_n = id_block - id_tile_m * num_tile_n1;
const auto iM0 = ps.read_first_lane(id_tile.At<0>() * kM0PerBlock);
const auto iN1 = ps.read_first_lane(id_tile.At<1>() * kN1PerBlock);
const auto iM0 = __builtin_amdgcn_readfirstlane(id_tile_m * kM0PerBlock);
const auto iN1 = __builtin_amdgcn_readfirstlane(id_tile_n * kN1PerBlock);
__shared__ char p_smem_char[GetStaticLdsSize()];
......@@ -333,18 +328,18 @@ struct GemmSoftmaxGemm
const auto b1_block_tile = load_tile(b1_dram_block_window);
// wait for block gemm0 pipeline to finish
ps.block_sync_lds();
block_sync_lds();
store_tile(b1_lds_block_window, b1_block_tile);
// wait for store_tile to finish
ps.block_sync_lds();
block_sync_lds();
// acc1 += c0 * b1
block_gemm1(acc1_block_tile, c0_block_tile, b1_lds_block_window);
// wait for block gemm1 to finish
ps.block_sync_lds();
block_sync_lds();
}
// move tile windows
......
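In gemm_softmax_gemm.hpp the block-to-tile mapping above drops the cluster descriptor in favor of plain integer division, and the tile origin is broadcast through __builtin_amdgcn_readfirstlane so it stays wave-uniform (i.e. in scalar registers). A condensed sketch of the pattern, using the same names as the kernel:

// one output tile of kM0PerBlock x kN1PerBlock per workgroup
const auto id_block  = get_block_id();
const auto id_tile_m = id_block / num_tile_n1;             // tile row
const auto id_tile_n = id_block - id_tile_m * num_tile_n1; // tile column (remainder)

// readfirstlane marks the result as uniform across the wavefront
const auto iM0 = __builtin_amdgcn_readfirstlane(id_tile_m * kM0PerBlock);
const auto iN1 = __builtin_amdgcn_readfirstlane(id_tile_n * kN1PerBlock);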
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
#include "tile_program.hpp"
#include "ck/library/utility/device_memory.hpp"
// ProgramServer contains a "meta data buffer":
// the host evaluates the expression inside ps() and pushes the result into the meta data buffer,
// then ProgramServer sends the meta data buffer to the GPU as a kernel argument,
// and the device reads (does not re-evaluate) the value of the expression inside ps() from the meta data buffer
struct HelloWorld
{
__host__ __device__ void operator()(ProgramServer& ps, int x, int y, int* res)
{
#if 1
auto r0 = ps(x + y);
auto r1 = ps(x - y);
res[0] = r0;
res[1] = r1;
#elif 1
(void)x;
(void)y;
auto r0 = ps.get_thread_id();
auto r1 = ps.warp_shuffle_up(r0, 1);
auto r2 = ps.warp_shuffle_down(r0, 1);
printf("tid %d, r0 %d, r1 %d, r2 %d\n", ps.get_thread_id(), r0, r1, r2);
res[0] = r0;
res[1] = r2;
#endif
}
};
int main()
{
int x = 100;
int y = 101;
DeviceMem res_dev_buf(2 * sizeof(int));
launch(ProgramServer{},
HelloWorld{},
1,
64,
x,
y,
static_cast<int*>(res_dev_buf.GetDeviceBuffer()));
int res_host[2];
res_dev_buf.FromDevice(&res_host);
printf("x+y=: %d\n", res_host[0]);
printf("x-y=: %d\n", res_host[1]);
return 0;
}
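With the program server removed, a hello-world-style example would pass a plain __device__ functor straight to launch_kernel; a rough sketch under that assumption (this HelloWorld rewrite is hypothetical, not part of this commit):

struct HelloWorld
{
    __device__ void operator()(int x, int y, int* res) const
    {
        // evaluated on the device; no meta data buffer involved
        res[0] = x + y;
        res[1] = x - y;
    }
};

// launch_kernel(StreamConfig, kernel, grid_dim, block_dim, lds_bytes, args...)
launch_kernel(StreamConfig{nullptr, true},
              HelloWorld{},
              1,
              64,
              0,
              x,
              y,
              static_cast<int*>(res_dev_buf.GetDeviceBuffer()));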
......@@ -9,10 +9,9 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_window.hpp"
#include "ck/tile_program/tile/load_tile.hpp"
......@@ -105,10 +104,8 @@ struct Im2Col
Sequence<0, 1>>{});
}
template <typename Server>
__host__ __device__ void
operator()(Server& ps,
const std::array<ck::index_t, NDimSpatial + 2>& a_n_wis_c_lengths,
operator()(const std::array<ck::index_t, NDimSpatial + 2>& a_n_wis_c_lengths,
const std::array<ck::index_t, NDimSpatial + 2>& /* a_n_wis_c_strides */,
const std::array<ck::index_t, NDimSpatial + 2>& b_k_xs_c_lengths,
const std::array<ck::index_t, NDimSpatial + 2>& /* b_k_xs_c_strides */,
......@@ -118,10 +115,8 @@ struct Im2Col
const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
const std::array<ck::index_t, NDimSpatial>& input_left_pads,
const std::array<ck::index_t, NDimSpatial>& input_right_pads,
//
const std::array<ck::index_t, 2> a_gemmm_gemmk_lengths,
const std::array<ck::index_t, 2> a_gemmm_gemmk_strides,
//
const T* p_a_img,
T* p_a_mtx)
{
......@@ -176,8 +171,8 @@ struct Im2Col
const auto src_gemmm_gemmk =
transform_tensor_view(a_n_y_ho_x_wo_c,
make_tuple(ps(make_merge_transform(make_tuple(N, Ho, Wo))),
ps(make_merge_transform(make_tuple(Y, X, C)))),
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)),
make_merge_transform(make_tuple(Y, X, C))),
make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
......@@ -191,15 +186,15 @@ struct Im2Col
const auto numGemmM = a_gemmm_gemmk_lengths[0];
const auto numGemmK = a_gemmm_gemmk_lengths[1];
const auto id_block = ps.get_block_id();
const auto id_block = get_block_id();
const auto num_tile_m = ps.read_first_lane(numGemmM / kMPerBlock);
const auto num_tile_m = __builtin_amdgcn_readfirstlane(numGemmM / kMPerBlock);
const auto block2tile = ps(make_cluster_descriptor(make_tuple(num_tile_m)));
const auto block2tile = make_cluster_descriptor(make_tuple(num_tile_m));
const auto i_gemmm_gemmk = block2tile.CalculateBottomIndex(make_multi_index(id_block));
const auto iGemmM = ps.read_first_lane(i_gemmm_gemmk[0]) * kMPerBlock;
const auto iGemmM = __builtin_amdgcn_readfirstlane(i_gemmm_gemmk[0]) * kMPerBlock;
// src window
auto src_block_window =
......@@ -327,10 +322,12 @@ int main()
ck::index_t kGridSize = (N * Ho * Wo) / kGemmMPerBlock;
float ave_time = launch(ProgramServer{},
float ave_time =
launch_kernel(StreamConfig{nullptr, true},
Im2Col<2, DataType, kBlockSize, kGemmMPerBlock, kGemmKPerBlock>{},
kGridSize,
kBlockSize,
0,
in_lengths,
in_strides,
wei_lengths,
......@@ -341,10 +338,8 @@ int main()
filter_dilations,
input_left_pads,
input_right_pads,
//
in_mtx_lengths,
in_mtx_strides,
//
static_cast<DataType*>(in_buf.GetDeviceBuffer()),
static_cast<DataType*>(in_mtx_buf.GetDeviceBuffer()));
......
......@@ -4,9 +4,8 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -39,9 +38,9 @@ void reference_reduce(const Tensor<ADataType>& a_m_n, Tensor<BDataType>& b_m)
int main(int argc, char* argv[])
{
using ADataType = float;
using ADataType = ck::half_t;
using AccDataType = float;
using BDataType = float;
using BDataType = ck::half_t;
ck::index_t M = 3328;
ck::index_t N = 4096;
......@@ -84,10 +83,11 @@ int main(int argc, char* argv[])
const auto kernel =
Reduce<ADataType, AccDataType, BDataType, kBlockSize, kMPerBlock, kNPerBlock>{};
float ave_time = launch(ProgramServer{},
float ave_time = launch_kernel(StreamConfig{nullptr, true},
kernel,
kGridSize,
kBlockSize,
0,
static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
M,
......
......@@ -8,7 +8,6 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_window.hpp"
#include "ck/tile_program/tile/load_tile.hpp"
......@@ -25,7 +24,7 @@ template <typename ADataType,
struct Reduce
{
#if 0
__host__ __device__ static constexpr auto MakeABlockTileDistribution()
__device__ static constexpr auto MakeABlockTileDistribution()
{
using namespace ck;
using namespace ck::tile_program;
......@@ -40,7 +39,7 @@ struct Reduce
Sequence<0, 0, 2, 4>>{});
}
#elif 0
__host__ __device__ static constexpr auto MakeABlockTileDistribution()
__device__ static constexpr auto MakeABlockTileDistribution()
{
using namespace ck;
using namespace ck::tile_program;
......@@ -55,7 +54,7 @@ struct Reduce
Sequence<0, 0, 2, 4>>{});
}
#elif 1
__host__ __device__ static constexpr auto MakeABlockTileDistribution()
__device__ static constexpr auto MakeABlockTileDistribution()
{
using namespace ck;
using namespace ck::tile_program;
......@@ -71,8 +70,8 @@ struct Reduce
}
#endif
__host__ __device__ void operator()(
ProgramServer& ps, const ADataType* p_a, BDataType* p_b, ck::index_t M, ck::index_t N) const
__device__ void
operator()(const ADataType* p_a, BDataType* p_b, ck::index_t M, ck::index_t N) const
{
using namespace ck;
using namespace ck::tile_program;
......@@ -81,7 +80,7 @@ struct Reduce
const auto a_m_n = make_naive_tensor_view<AddressSpaceEnum::Global>(
p_a, make_tuple(M, N), make_tuple(N, 1), Number<32>{}, Number<1>{});
const auto iM = ps.get_block_id() * kMPerBlock;
const auto iM = get_block_id() * kMPerBlock;
// A window
auto a_block_window =
......
......@@ -4,9 +4,8 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -64,10 +63,11 @@ int main(int argc, char* argv[])
const auto kernel =
Softmax<ADataType, AccDataType, BDataType, kBlockSize, kMPerBlock, kNPerBlock>{};
float ave_time = launch(ProgramServer{},
float ave_time = launch_kernel(StreamConfig{nullptr, true},
kernel,
kGridSize,
kBlockSize,
0,
static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
M,
......@@ -81,8 +81,5 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
LogRangeAsType<float>(std::cout << "dev: ", b_host_dev.mData, ", ") << std::endl;
LogRangeAsType<float>(std::cout << "ref: ", b_host_ref.mData, ", ") << std::endl;
return !ck::utils::check_err(b_host_dev, b_host_ref);
}
......@@ -8,7 +8,6 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_window.hpp"
#include "ck/tile_program/tile/load_tile.hpp"
......@@ -25,7 +24,7 @@ template <typename ADataType,
struct Softmax
{
#if 0
__host__ __device__ static constexpr auto MakeABlockTileDistribution()
__device__ static constexpr auto MakeABlockTileDistribution()
{
using namespace ck;
using namespace ck::tile_program;
......@@ -40,7 +39,7 @@ struct Softmax
Sequence<0, 0, 2, 4>>{});
}
#elif 0
__host__ __device__ static constexpr auto MakeABlockTileDistribution()
__device__ static constexpr auto MakeABlockTileDistribution()
{
using namespace ck;
using namespace ck::tile_program;
......@@ -55,7 +54,7 @@ struct Softmax
Sequence<0, 0, 2, 4>>{});
}
#elif 1
__host__ __device__ static constexpr auto MakeABlockTileDistribution()
__device__ static constexpr auto MakeABlockTileDistribution()
{
using namespace ck;
using namespace ck::tile_program;
......@@ -71,8 +70,8 @@ struct Softmax
}
#endif
__host__ __device__ void operator()(
ProgramServer& ps, const ADataType* p_a, BDataType* p_b, ck::index_t M, ck::index_t N) const
__device__ void
operator()(const ADataType* p_a, BDataType* p_b, ck::index_t M, ck::index_t N) const
{
using namespace ck;
using namespace ck::tile_program;
......@@ -84,7 +83,7 @@ struct Softmax
const auto a_m_n = make_naive_tensor_view<AddressSpaceEnum::Global>(
p_a, make_tuple(M, N), make_tuple(N, 1), Number<32>{}, Number<1>{});
const auto iM = ps.get_block_id() * kMPerBlock;
const auto iM = get_block_id() * kMPerBlock;
// A window
auto a_block_window =
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <hip/hip_runtime.h>
#include "ck/ck.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
// Meta data for GPU
// TODO: do we need to take care of data alignment in code, or is it handled by the compiler?
template <ck::index_t kSize>
struct MetaData
{
char p_data_[kSize];
ck::index_t size_ = 0;
ck::index_t pos_ = 0;
__host__ __device__ void reset()
{
size_ = 0;
pos_ = 0;
}
__device__ void reset_pos() { pos_ = 0; }
// push meta data on host
// TODO: correct forwarding?
template <typename T>
__host__ auto push(T&& a)
{
using Type = ck::remove_cvref_t<T>;
static_assert(std::is_trivially_copy_constructible_v<Type> &&
std::is_trivially_destructible_v<Type>);
assert(size_ + sizeof(Type) <= kSize);
// use placement new to create object copy
new(p_data_ + size_) Type(std::forward<T>(a));
size_ += sizeof(Type);
return ck::forwarder{}(a);
}
// pull meta data on device
// TODO: correct forwarding?
template <typename T>
__device__ auto pull()
{
using Type = ck::remove_cvref_t<T>;
static_assert(std::is_trivially_copy_constructible_v<Type> &&
std::is_trivially_destructible_v<Type>);
Type a(*reinterpret_cast<Type*>(p_data_ + pos_));
pos_ += sizeof(Type);
return a;
}
};
// namespace tp (for tile programming)
struct ProgramServer
{
// meta data on device
MetaData<1024> meta_data_;
__host__ void cpu_init() { meta_data_.reset(); }
__device__ void gpu_init() { meta_data_.reset_pos(); }
// push meta data on host
template <typename T>
__host__ auto operator()(T&& a)
{
return ck::forwarder{}(meta_data_.push(a));
}
// pull meta data on device
template <typename T>
__device__ auto operator()(T&&)
{
return ck::forwarder{}(meta_data_.pull<T>());
}
//
__host__ static ck::index_t get_block_id() { return -1; }
__host__ static ck::index_t get_thread_id() { return -1; }
__host__ static ck::index_t get_grid_size() { return -1; }
__host__ static void block_sync_lds() {}
// TODO: correct forwarding?
template <typename T>
__host__ static constexpr auto read_first_lane(T&& a)
{
return ck::forwarder{}(a);
}
template <typename T>
__host__ T warp_shuffle_up(T, uint32_t)
{
return 0;
}
template <typename T>
__host__ T warp_shuffle_down(T, uint32_t)
{
return 0;
}
//
__device__ static ck::index_t get_block_id() { return ck::get_block_id(); }
__device__ static ck::index_t get_thread_id() { return ck::get_thread_id(); }
__device__ static ck::index_t get_grid_size() { return ck::get_grid_size(); }
__device__ static void block_sync_lds() { ck::block_sync_lds(); }
template <typename T>
__device__ static constexpr auto read_first_lane(T&& a)
{
return __builtin_amdgcn_readfirstlane(a);
}
template <typename T>
__device__ T warp_shuffle_up(const T& var, uint32_t delta)
{
return ck::warp_shuffle_up(var, delta);
}
template <typename T>
__device__ T warp_shuffle_down(const T& var, uint32_t delta)
{
return ck::warp_shuffle_down(var, delta);
}
};
template <typename Server, typename Program, typename... Xs>
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
__global__ void gpu_program_wrapper(Server server, Program f, Xs... xs)
{
server.gpu_init();
f(server, xs...);
}
template <typename Server, typename Program, typename... Xs>
float launch(Server server, Program f, dim3 grid_dim, dim3 block_dim, Xs... xs)
{
server.cpu_init();
f(server, xs...);
printf("meta data size %d\n", server.meta_data_.size_);
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
__func__,
grid_dim.x,
grid_dim.y,
grid_dim.z,
block_dim.x,
block_dim.y,
block_dim.z);
#if 0
gpu_program_wrapper<Server, Program><<<grid_dim, block_dim, 0, nullptr>>>(server, f, xs...);
#else
return launch_and_time_kernel(StreamConfig{nullptr, true, 0},
gpu_program_wrapper<Server, Program, Xs...>,
grid_dim,
block_dim,
0,
server,
f,
xs...);
#endif
}
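For context on what this removed file did: launch() ran the same functor twice — once on the host, after cpu_init(), so every ps(expr) evaluated the expression and pushed the result into meta_data_, and once on the device, after gpu_init(), so every ps(expr) merely pulled the pre-computed value back out. A condensed sketch of that round trip, using the names above:

ProgramServer server;

// host pass: evaluate and record
server.cpu_init();          // meta_data_.reset()
f(server, xs...);           // each ps(expr) evaluates expr and pushes the result

// device pass (server is copied to the GPU as a kernel argument)
server.gpu_init();          // meta_data_.reset_pos()
f(server, xs...);           // each ps(expr) pulls the recorded value instead of evaluating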
......@@ -9,6 +9,15 @@
#include "ck/stream_config.hpp"
#include "ck/host_utility/hip_check_error.hpp"
template <int MaxThreadPerBlock, int MinBlockPerCu, typename Kernel, typename... Args>
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
#endif
__global__ void kernel_wrapper(Kernel f, Args... args)
{
f(args...);
}
template <typename... Args, typename F>
float launch_and_time_kernel(const StreamConfig& stream_config,
F kernel,
......@@ -142,3 +151,20 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
return 0;
#endif
}
template <int MaxThreadPerBlock = CK_MAX_THREAD_PER_BLOCK,
int MinBlockPerCu = CK_MIN_BLOCK_PER_CU,
typename KernelImpl,
typename... Args>
float launch_kernel(const StreamConfig& stream_config,
KernelImpl kernel_impl,
dim3 grid_dim,
dim3 block_dim,
std::size_t lds_byte,
Args... args)
{
const auto kernel = kernel_wrapper<MaxThreadPerBlock, MinBlockPerCu, KernelImpl, Args...>;
return launch_and_time_kernel(
stream_config, kernel, grid_dim, block_dim, lds_byte, kernel_impl, args...);
}
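launch_kernel wraps the functor in kernel_wrapper, so the MaxThreadPerBlock / MinBlockPerCu template arguments become per-instantiation __launch_bounds__ (when CK_USE_LAUNCH_BOUNDS is enabled) rather than one global setting. A short usage sketch based on the call sites in this commit:

// defaults: __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU),
// as used by the im2col, reduce and softmax examples
launch_kernel(StreamConfig{nullptr, true}, kernel, kGridSize, kBlockSize, 0, args...);

// per-kernel bounds: __launch_bounds__(kBlockSize, 2) for this instantiation only,
// as used by the gemm, gemm_gemm and gemm_softmax_gemm examples
launch_kernel<kBlockSize, 2>(StreamConfig{nullptr, true}, kernel, kGridSize, kBlockSize, 0, args...);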
......@@ -340,63 +340,6 @@ struct BlockGemmARegBSmemCRegV1
return c_block_tensor;
}
// FIXME: remove: dummy host function for tile programming
template <typename CBlockTensor, typename ABlockTensorTmp, typename BBlockWindowTmp>
__host__ void operator()(CBlockTensor&, const ABlockTensorTmp&, const BBlockWindowTmp&) const
{
}
// FIXME: remove: dummy host function for tile programming
template <typename ABlockTensorTmp, typename BBlockWindowTmp>
__host__ auto operator()(const ABlockTensorTmp&, const BBlockWindowTmp&) const
{
static_assert(is_same_v<ADataType, remove_cv_t<typename ABlockTensorTmp::DataType>> &&
is_same_v<BDataType, remove_cv_t<typename BBlockWindowTmp::DataType>>,
"wrong!");
constexpr index_t MPerBlock = ABlockTensorTmp{}.GetLengths()[Number<0>{}];
constexpr index_t NPerBlock = BBlockWindowTmp{}.GetWindowLengths()[Number<0>{}];
static_assert(MPerBlock == BlockGemmShape::kM && NPerBlock == BlockGemmShape::kN, "wrong!");
constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
using WG = remove_cvref_t<decltype(config.template At<0>())>;
constexpr index_t MWarp = config.template At<1>();
constexpr index_t NWarp = config.template At<2>();
constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
constexpr auto c_block_outer_dstr_encoding = StaticTileDistributionEncoding<
Sequence<>,
Tuple<Sequence<MIterPerWarp, MWarp>, Sequence<NIterPerWarp, NWarp>>,
Tuple<Sequence<1, 2>>,
Tuple<Sequence<1, 1>>,
Sequence<1, 2>,
Sequence<0, 0>>{};
constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
#if 0
// FIXME: need method to check a_block_tensor and a_block_tensor_tmp have equivalent distribution
static_assert(
is_same_v<remove_cvref_t<decltype(a_block_dstr_encode)>,
remove_cvref_t<decltype(
ABlockTensorTmp::GetBlockDistribution().GetStaticTensorDistributionEncoding())>>,
"wrong!");
#endif
// Construct C-Block-Tensor
auto c_block_tensor = make_static_distributed_tensor<CDataType>(c_block_dstr);
return c_block_tensor;
}
};
} // namespace block
......
......@@ -331,55 +331,6 @@ struct BlockGemmASmemBSmemCRegV1
return c_block_tensor;
}
// FIXME: remove: dummy host function for tile programming
template <typename CBlockTensor, typename ABlockWindowTmp, typename BBlockWindowTmp>
__host__ void operator()(CBlockTensor&, const ABlockWindowTmp&, const BBlockWindowTmp&) const
{
}
// FIXME: remove: dummy host function for tile programming
template <typename ABlockWindowTmp, typename BBlockWindowTmp>
__host__ auto operator()(const ABlockWindowTmp&, const BBlockWindowTmp&) const
{
static_assert(is_same_v<ADataType, typename ABlockWindowTmp::DataType> &&
is_same_v<BDataType, typename BBlockWindowTmp::DataType>,
"wrong!");
constexpr index_t MPerBlock = ABlockWindowTmp{}.GetWindowLengths()[Number<0>{}];
constexpr index_t NPerBlock = BBlockWindowTmp{}.GetWindowLengths()[Number<0>{}];
static_assert(MPerBlock == BlockGemmShape::kM && NPerBlock == BlockGemmShape::kN, "wrong!");
constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
using WG = remove_cvref_t<decltype(config.template At<0>())>;
constexpr index_t MWarp = config.template At<1>();
constexpr index_t NWarp = config.template At<2>();
constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
constexpr auto c_block_outer_dstr_encoding = StaticTileDistributionEncoding<
Sequence<>,
Tuple<Sequence<MIterPerWarp, MWarp>, Sequence<NIterPerWarp, NWarp>>,
Tuple<Sequence<1, 2>>,
Tuple<Sequence<1, 1>>,
Sequence<1, 2>,
Sequence<0, 0>>{};
constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
static_assert(is_same_v<CDataType, typename WG::CDataType>, "wrong!");
auto c_block_tensor = make_static_distributed_tensor<CDataType>(c_block_dstr);
return c_block_tensor;
}
};
} // namespace block
......
......@@ -210,52 +210,6 @@ __device__ auto block_tile_reduce(const InDistributedTensor_& in_tensor,
return acc_tensor;
}
// FIXME: dummy host function for tile program
template <typename AccDistributedTensor_,
typename InDistributedTensor_,
index_t... InReduceDims,
typename ReduceFunc>
__host__ void block_tile_reduce(AccDistributedTensor_&,
const InDistributedTensor_&,
Sequence<InReduceDims...>,
const ReduceFunc&)
{
}
// FIXME: dummy host function for tile program
template <typename AccDataType_,
typename InDistributedTensor_,
index_t... InReduceDims,
typename ReduceFunc,
typename InDataType_>
__host__ auto block_tile_reduce(const InDistributedTensor_&,
Sequence<InReduceDims...>,
const ReduceFunc&,
const InDataType_&)
{
using InDataType = typename InDistributedTensor_::DataType;
using AccDataType = remove_cvref_t<AccDataType_>;
static_assert(is_same_v<InDataType, remove_cvref_t<InDataType_>>, "wrong!");
// declare acc_tensor
constexpr auto acc_dstr = make_static_tile_distribution(
ck::tile_program::detail::make_reduce_tile_distribution_encoding(
InDistributedTensor_::GetTileDistribution().GetStaticTileDistributionEncoding(),
Sequence<InReduceDims...>{}));
auto acc_tensor = make_static_distributed_tensor<AccDataType>(acc_dstr);
return acc_tensor;
}
// FIXME: dummy host function for tile program
template <typename AccDistributedTensor_, typename ReduceFunc>
__host__ void block_tile_reduce_sync(AccDistributedTensor_&, const ReduceFunc&)
{
}
} // namespace block
} // namespace tile_program
} // namespace ck
......@@ -159,12 +159,12 @@ struct BlockGemmPipelineAGmemBGmemCRegV1
a_block_tile = load_tile(a_copy_dram_window);
b_block_tile = load_tile(b_copy_dram_window);
ProgramServer::block_sync_lds();
block_sync_lds();
// GEMM i
block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
ProgramServer::block_sync_lds();
block_sync_lds();
// move to i + 2
move_tile_window(a_copy_dram_window, {0, kKPerBlock});
......@@ -184,7 +184,7 @@ struct BlockGemmPipelineAGmemBGmemCRegV1
// tail
{
ProgramServer::block_sync_lds();
block_sync_lds();
// GEMM num_loop - 1
block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
......@@ -194,7 +194,7 @@ struct BlockGemmPipelineAGmemBGmemCRegV1
}
template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
__host__ __device__ auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
__device__ auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
const BDramBlockWindowTmp& b_dram_block_window_tmp,
index_t num_loop,
void* p_smem) const
......
......@@ -161,12 +161,12 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
do
{
ProgramServer::block_sync_lds();
block_sync_lds();
// GEMM i
block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
ProgramServer::block_sync_lds();
block_sync_lds();
// move to i + 2
move_tile_window(a_copy_dram_window, {0, kKPerBlock});
......@@ -190,12 +190,12 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
// tail
{
ProgramServer::block_sync_lds();
block_sync_lds();
// GEMM num_loop - 2
block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
ProgramServer::block_sync_lds();
block_sync_lds();
// LDS write num_loop - 1
const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
......@@ -204,7 +204,7 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_block_tile);
store_tile(b_copy_lds_window, b_block_tile_tmp);
ProgramServer::block_sync_lds();
block_sync_lds();
// GEMM num_loop - 1
block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
......@@ -214,7 +214,7 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
}
template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
__host__ __device__ auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
__device__ auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
const BDramBlockWindowTmp& b_dram_block_window_tmp,
index_t num_loop,
void* p_smem) const
......