Unverified Commit 6bc9ee05, authored by Chao Liu, committed by GitHub

Remove program server (#10)

* removing program server

* specify launch bound per kernel instance
parent f3baea0d
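
In short: kernel functors no longer take a `ProgramServer&` that replays host-evaluated expressions out of a meta-data buffer; they are now plain `__device__` functors started through a new `launch_kernel` helper whose launch bounds (max threads per block, min blocks per CU) are template parameters that can be pinned per kernel instance. A minimal sketch of the call-site change, reconstructed from the diff below (`kernel`, `args...` stand for any of the example kernels; `2` is the min-blocks-per-CU value the examples here use):

```cpp
// Before: the functor received a ProgramServer; launch() ran it once on the
// host to fill the meta-data buffer, then replayed it on the device.
float t0 = launch(ProgramServer{}, kernel, kGridSize, kBlockSize, args...);

// After: plain __device__ functor; launch bounds are per-instance template
// arguments, and the dynamic-LDS byte count is an explicit argument.
float t1 = launch_kernel<kBlockSize, 2>(
    StreamConfig{nullptr, true}, kernel, kGridSize, kBlockSize, /*lds_byte=*/0, args...);
```
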
-add_example_executable(example_hello_world hello_world.cpp)
 add_example_executable(example_im2col im2col.cpp)
 add_example_executable(example_gemm gemm.cpp)
 add_example_executable(example_gemm_gemm gemm_gemm.cpp)
...
@@ -4,9 +4,8 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/cluster_descriptor.hpp"
 #include "ck/tensor/tensor_view.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -115,22 +114,23 @@ int main(int argc, char* argv[])
                          kGemmNPerBlock,
                          kGemmKPerBlock>{};

-    float ave_time = launch(ProgramServer{},
-                            gemm_kernel,
-                            kGridSize,
-                            kBlockSize,
-                            static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
-                            static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
-                            static_cast<CDataType*>(c_buf.GetDeviceBuffer()),
-                            M,
-                            N,
-                            K,
-                            K,
-                            K,
-                            N,
-                            AElementFunction{},
-                            BElementFunction{},
-                            CElementFunction{});
+    float ave_time = launch_kernel<kBlockSize, 2>(StreamConfig{nullptr, true},
+                                                  gemm_kernel,
+                                                  kGridSize,
+                                                  kBlockSize,
+                                                  0,
+                                                  static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
+                                                  static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
+                                                  static_cast<CDataType*>(c_buf.GetDeviceBuffer()),
+                                                  M,
+                                                  N,
+                                                  K,
+                                                  K,
+                                                  K,
+                                                  N,
+                                                  AElementFunction{},
+                                                  BElementFunction{},
+                                                  CElementFunction{});

     c_buf.FromDevice(c_host_dev.mData.data());
...
@@ -7,8 +7,8 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "tile_program.hpp"
 #include "ck/tile_program/tile/tile_distribution.hpp"
 #include "ck/tile_program/tile/tile_elementwise.hpp"
 #include "ck/tile_program/tile/tile_gemm_shape.hpp"
@@ -59,19 +59,18 @@ struct Gemm
     using GridGemm = ck::tile_program::grid::GridGemm<Problem, Policy>;

-    __host__ __device__ void operator()(ProgramServer& ps,
-                                        const ADataType* p_a,
-                                        const BDataType* p_b,
-                                        CDataType* p_c,
-                                        ck::index_t M,
-                                        ck::index_t N,
-                                        ck::index_t K,
-                                        ck::index_t Lda,
-                                        ck::index_t Ldb,
-                                        ck::index_t Ldc,
-                                        const AElementFunction& a_element_func,
-                                        const BElementFunction& b_element_func,
-                                        const CElementFunction& c_element_func) const
+    __device__ void operator()(const ADataType* p_a,
+                               const BDataType* p_b,
+                               CDataType* p_c,
+                               ck::index_t M,
+                               ck::index_t N,
+                               ck::index_t K,
+                               ck::index_t Lda,
+                               ck::index_t Ldb,
+                               ck::index_t Ldc,
+                               const AElementFunction& a_element_func,
+                               const BElementFunction& b_element_func,
+                               const CElementFunction& c_element_func) const
     {
         using namespace ck;
         using namespace ck::tile_program;
@@ -87,12 +86,7 @@ struct Gemm
         auto c_dram_grid = make_naive_tensor_view<AddressSpaceEnum::Global>(
             p_c, make_tuple(M, N), make_tuple(Ldc, 1), Number<32>{}, Number<1>{});

-        GridGemm{}(ps,
-                   a_dram_grid,
-                   b_dram_grid,
-                   c_dram_grid,
-                   a_element_func,
-                   b_element_func,
-                   c_element_func);
+        GridGemm{}(
+            a_dram_grid, b_dram_grid, c_dram_grid, a_element_func, b_element_func, c_element_func);
     }
 };
...
@@ -4,9 +4,8 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/cluster_descriptor.hpp"
 #include "ck/tensor/tensor_view.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -90,33 +89,35 @@ int main(int argc, char* argv[])
     std::cout << "grid size " << kGridSize << std::endl;

-    float ave_time = launch(ProgramServer{},
-                            GemmGemm<A0DataType,
-                                     B0DataType,
-                                     Acc0DataType,
-                                     C0DataType,
-                                     B1DataType,
-                                     Acc1DataType,
-                                     C1DataType,
-                                     kBlockSize,
-                                     kM0PerBlock,
-                                     kN0PerBlock,
-                                     kK0PerBlock,
-                                     kN1PerBlock>{},
-                            kGridSize,
-                            kBlockSize,
-                            static_cast<A0DataType*>(a0_buf.GetDeviceBuffer()),
-                            static_cast<B0DataType*>(b0_buf.GetDeviceBuffer()),
-                            static_cast<B1DataType*>(b1_buf.GetDeviceBuffer()),
-                            static_cast<C1DataType*>(c1_buf.GetDeviceBuffer()),
-                            M0,
-                            N0,
-                            K0,
-                            N1,
-                            K0, // Lda0
-                            K0, // Ldb0
-                            N0, // Ldb1
-                            N1); // Ldc1
+    float ave_time =
+        launch_kernel<kBlockSize, 2>(StreamConfig{nullptr, true},
+                                     GemmGemm<A0DataType,
+                                              B0DataType,
+                                              Acc0DataType,
+                                              C0DataType,
+                                              B1DataType,
+                                              Acc1DataType,
+                                              C1DataType,
+                                              kBlockSize,
+                                              kM0PerBlock,
+                                              kN0PerBlock,
+                                              kK0PerBlock,
+                                              kN1PerBlock>{},
+                                     kGridSize,
+                                     kBlockSize,
+                                     0,
+                                     static_cast<A0DataType*>(a0_buf.GetDeviceBuffer()),
+                                     static_cast<B0DataType*>(b0_buf.GetDeviceBuffer()),
+                                     static_cast<B1DataType*>(b1_buf.GetDeviceBuffer()),
+                                     static_cast<C1DataType*>(c1_buf.GetDeviceBuffer()),
+                                     M0,
+                                     N0,
+                                     K0,
+                                     N1,
+                                     K0, // Lda0
+                                     K0, // Ldb0
+                                     N0, // Ldb1
+                                     N1); // Ldc1

     c1_buf.FromDevice(c1_host_dev.mData.data());
...
@@ -8,7 +8,6 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
-#include "tile_program.hpp"
 #include "ck/tile_program/tile/tile_distribution.hpp"
 #include "ck/tile_program/tile/tile_elementwise.hpp"
 #include "ck/tile_program/tile/tile_gemm_shape.hpp"
@@ -54,7 +53,7 @@ struct GemmGemm
 #if 0
     // 2d
-    __host__ __device__ static constexpr auto MakeB1LdsBlockDescriptor()
+    __device__ static constexpr auto MakeB1LdsBlockDescriptor()
     {
         using namespace ck;
@@ -68,7 +67,7 @@ struct GemmGemm
     }
 #else
     // fake XOR
-    __host__ __device__ static constexpr auto MakeB1LdsBlockDescriptor()
+    __device__ static constexpr auto MakeB1LdsBlockDescriptor()
     {
         using namespace ck;
@@ -100,7 +99,7 @@ struct GemmGemm
     }
 #endif

-    __host__ __device__ static constexpr auto MakeB1DramTileDistribution()
+    __device__ static constexpr auto MakeB1DramTileDistribution()
     {
         using namespace ck;
         using namespace ck::tile_program;
@@ -125,7 +124,7 @@ struct GemmGemm
                        Sequence<0, 1>>{});
     }

-    __host__ __device__ static constexpr ck::index_t GetStaticLdsSize()
+    __device__ static constexpr ck::index_t GetStaticLdsSize()
     {
         using namespace ck;
@@ -134,19 +133,18 @@ struct GemmGemm
                                             sizeof(B1DataType)));
     }

-    __host__ __device__ void operator()(ProgramServer& ps,
-                                        const A0DataType* p_a0,
-                                        const B0DataType* p_b0,
-                                        const B1DataType* p_b1,
-                                        C1DataType* p_c1,
-                                        ck::index_t M0,
-                                        ck::index_t N0,
-                                        ck::index_t K0,
-                                        ck::index_t N1,
-                                        ck::index_t Lda0,
-                                        ck::index_t Ldb0,
-                                        ck::index_t Ldb1,
-                                        ck::index_t Ldc1)
+    __device__ void operator()(const A0DataType* p_a0,
+                               const B0DataType* p_b0,
+                               const B1DataType* p_b1,
+                               C1DataType* p_c1,
+                               ck::index_t M0,
+                               ck::index_t N0,
+                               ck::index_t K0,
+                               ck::index_t N1,
+                               ck::index_t Lda0,
+                               ck::index_t Ldb0,
+                               ck::index_t Ldb1,
+                               ck::index_t Ldc1)
     {
         using namespace ck;
         using namespace ck::tile_program;
@@ -163,17 +161,17 @@ struct GemmGemm
             p_b1, make_tuple(N1, N0), make_tuple(Ldb1, 1), Number<32>{}, Number<1>{});

         // divide problem
-        const auto id_block = ps.get_block_id();
+        const auto id_block = get_block_id();

         const auto num_tile_m0 = M0 / kM0PerBlock;
         const auto num_tile_n1 = N1 / kN1PerBlock;

-        const auto block2tile = ps(make_cluster_descriptor(make_tuple(num_tile_m0, num_tile_n1)));
+        const auto block2tile = make_cluster_descriptor(make_tuple(num_tile_m0, num_tile_n1));

         const auto id_tile = block2tile.CalculateBottomIndex(make_tuple(id_block));

-        const auto iM0 = ps.read_first_lane(id_tile.At<0>() * kM0PerBlock);
-        const auto iN1 = ps.read_first_lane(id_tile.At<1>() * kN1PerBlock);
+        const auto iM0 = __builtin_amdgcn_readfirstlane(id_tile.At<0>() * kM0PerBlock);
+        const auto iN1 = __builtin_amdgcn_readfirstlane(id_tile.At<1>() * kN1PerBlock);

         __shared__ char p_smem_char[GetStaticLdsSize()];
@@ -233,18 +231,18 @@ struct GemmGemm
             const auto b1_block_tile = load_tile(b1_dram_block_window);

             // wait for block gemm0 pipeline to finish
-            ps.block_sync_lds();
+            block_sync_lds();

             store_tile(b1_lds_block_window, b1_block_tile);

             // wait for store_tile to finish
-            ps.block_sync_lds();
+            block_sync_lds();

             // acc1 += c0 * b1
             block_gemm1(acc1_block_tile, c0_block_tile, b1_lds_block_window);

             // wait for block gemm1 to finish
-            ps.block_sync_lds();
+            block_sync_lds();
         }

         // move tile windows
...
@@ -4,9 +4,8 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/cluster_descriptor.hpp"
 #include "ck/tensor/tensor_view.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -73,14 +72,10 @@ int main(int argc, char* argv[])
     ck::utils::FillUniformDistributionIntegerValue<A0DataType>{-3.f, 3.f}(a0_host);
     ck::utils::FillUniformDistributionIntegerValue<B0DataType>{-3.f, 3.f}(b0_host);
     ck::utils::FillUniformDistributionIntegerValue<B1DataType>{-3.f, 3.f}(b1_host);
-#elif 0
+#else
     ck::utils::FillUniformDistribution<A0DataType>{-3.f, 3.f}(a0_host);
     ck::utils::FillUniformDistribution<B0DataType>{-3.f, 3.f}(b0_host);
     ck::utils::FillUniformDistribution<B1DataType>{-3.f, 3.f}(b1_host);
-#else
-    ck::utils::FillConstant<A0DataType>{1.0f}(a0_host);
-    ck::utils::FillConstant<A0DataType>{1.0f}(b0_host);
-    ck::utils::FillConstant<A0DataType>{1.0f}(b1_host);
 #endif

     // reference
@@ -107,33 +102,35 @@ int main(int argc, char* argv[])
     std::cout << "grid size " << kGridSize << std::endl;

-    float ave_time = launch(ProgramServer{},
-                            GemmSoftmaxGemm<A0DataType,
-                                            B0DataType,
-                                            Acc0DataType,
-                                            C0DataType,
-                                            B1DataType,
-                                            Acc1DataType,
-                                            C1DataType,
-                                            kBlockSize,
-                                            kM0PerBlock,
-                                            kN0PerBlock,
-                                            kK0PerBlock,
-                                            kN1PerBlock>{},
-                            kGridSize,
-                            kBlockSize,
-                            static_cast<A0DataType*>(a0_buf.GetDeviceBuffer()),
-                            static_cast<B0DataType*>(b0_buf.GetDeviceBuffer()),
-                            static_cast<B1DataType*>(b1_buf.GetDeviceBuffer()),
-                            static_cast<C1DataType*>(c1_buf.GetDeviceBuffer()),
-                            M0,
-                            N0,
-                            K0,
-                            N1,
-                            K0, // Lda0
-                            K0, // Ldb0
-                            N0, // Ldb1
-                            N1); // Ldc1
+    float ave_time =
+        launch_kernel<kBlockSize, 2>(StreamConfig{nullptr, true},
+                                     GemmSoftmaxGemm<A0DataType,
+                                                     B0DataType,
+                                                     Acc0DataType,
+                                                     C0DataType,
+                                                     B1DataType,
+                                                     Acc1DataType,
+                                                     C1DataType,
+                                                     kBlockSize,
+                                                     kM0PerBlock,
+                                                     kN0PerBlock,
+                                                     kK0PerBlock,
+                                                     kN1PerBlock>{},
+                                     kGridSize,
+                                     kBlockSize,
+                                     0,
+                                     static_cast<A0DataType*>(a0_buf.GetDeviceBuffer()),
+                                     static_cast<B0DataType*>(b0_buf.GetDeviceBuffer()),
+                                     static_cast<B1DataType*>(b1_buf.GetDeviceBuffer()),
+                                     static_cast<C1DataType*>(c1_buf.GetDeviceBuffer()),
+                                     M0,
+                                     N0,
+                                     K0,
+                                     N1,
+                                     K0, // Lda0
+                                     K0, // Ldb0
+                                     N0, // Ldb1
+                                     N1); // Ldc1

     c1_buf.FromDevice(c1_host_dev.mData.data());
...
@@ -8,7 +8,6 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
-#include "tile_program.hpp"
 #include "ck/tile_program/tile/tile_distribution.hpp"
 #include "ck/tile_program/tile/tile_elementwise.hpp"
 #include "ck/tile_program/tile/tile_gemm_shape.hpp"
@@ -19,8 +18,7 @@
 #include "ck/tile_program/block_tile/block_reduce.hpp"

 // C0 = A0 * B0
-// D0 = softmax(C0)
-// C1 = D0 * B1
+// C1 = softmax(C0) * B1
 template <typename A0DataType,
           typename B0DataType,
           typename Acc0DataType,
@@ -57,7 +55,7 @@ struct GemmSoftmaxGemm
 #if 0
     // 2d
-    __host__ __device__ static constexpr auto MakeB1LdsBlockDescriptor()
+    __device__ static constexpr auto MakeB1LdsBlockDescriptor()
     {
         using namespace ck;
@@ -71,7 +69,7 @@ struct GemmSoftmaxGemm
     }
 #else
     // fake XOR
-    __host__ __device__ static constexpr auto MakeB1LdsBlockDescriptor()
+    __device__ static constexpr auto MakeB1LdsBlockDescriptor()
     {
         using namespace ck;
@@ -103,7 +101,7 @@ struct GemmSoftmaxGemm
     }
 #endif

-    __host__ __device__ static constexpr auto MakeB1DramTileDistribution()
+    __device__ static constexpr auto MakeB1DramTileDistribution()
     {
         using namespace ck;
         using namespace ck::tile_program;
@@ -128,7 +126,7 @@ struct GemmSoftmaxGemm
                        Sequence<0, 1>>{});
     }

-    __host__ __device__ static constexpr ck::index_t GetStaticLdsSize()
+    __device__ static constexpr ck::index_t GetStaticLdsSize()
     {
         using namespace ck;
@@ -137,19 +135,18 @@ struct GemmSoftmaxGemm
                                             sizeof(B1DataType)));
     }

-    __host__ __device__ void operator()(ProgramServer& ps,
-                                        const A0DataType* p_a0,
-                                        const B0DataType* p_b0,
-                                        const B1DataType* p_b1,
-                                        C1DataType* p_c1,
-                                        ck::index_t M0,
-                                        ck::index_t N0,
-                                        ck::index_t K0,
-                                        ck::index_t N1,
-                                        ck::index_t Lda0,
-                                        ck::index_t Ldb0,
-                                        ck::index_t Ldb1,
-                                        ck::index_t Ldc1)
+    __device__ void operator()(const A0DataType* p_a0,
+                               const B0DataType* p_b0,
+                               const B1DataType* p_b1,
+                               C1DataType* p_c1,
+                               ck::index_t M0,
+                               ck::index_t N0,
+                               ck::index_t K0,
+                               ck::index_t N1,
+                               ck::index_t Lda0,
+                               ck::index_t Ldb0,
+                               ck::index_t Ldb1,
+                               ck::index_t Ldc1)
     {
         using namespace ck;
         using namespace ck::tile_program;
@@ -169,17 +166,15 @@ struct GemmSoftmaxGemm
             p_b1, make_tuple(N1, N0), make_tuple(Ldb1, 1), Number<32>{}, Number<1>{});

         // divide problem
-        const auto id_block = ps.get_block_id();
-
-        const auto num_tile_m0 = M0 / kM0PerBlock;
         const auto num_tile_n1 = N1 / kN1PerBlock;

-        const auto block2tile = ps(make_cluster_descriptor(make_tuple(num_tile_m0, num_tile_n1)));
-        const auto id_tile = block2tile.CalculateBottomIndex(make_tuple(id_block));
+        const auto id_block = get_block_id();

-        const auto iM0 = ps.read_first_lane(id_tile.At<0>() * kM0PerBlock);
-        const auto iN1 = ps.read_first_lane(id_tile.At<1>() * kN1PerBlock);
+        const auto id_tile_m = id_block / num_tile_n1;
+        const auto id_tile_n = id_block - id_tile_m * num_tile_n1;
+
+        const auto iM0 = __builtin_amdgcn_readfirstlane(id_tile_m * kM0PerBlock);
+        const auto iN1 = __builtin_amdgcn_readfirstlane(id_tile_n * kN1PerBlock);

         __shared__ char p_smem_char[GetStaticLdsSize()];
@@ -333,18 +328,18 @@ struct GemmSoftmaxGemm
             const auto b1_block_tile = load_tile(b1_dram_block_window);

             // wait for block gemm0 pipeline to finish
-            ps.block_sync_lds();
+            block_sync_lds();

             store_tile(b1_lds_block_window, b1_block_tile);

             // wait for store_tile to finish
-            ps.block_sync_lds();
+            block_sync_lds();

             // acc1 += c0 * b1
             block_gemm1(acc1_block_tile, c0_block_tile, b1_lds_block_window);

             // wait for block gemm1 to finish
-            ps.block_sync_lds();
+            block_sync_lds();
         }

         // move tile windows
...
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
#include "tile_program.hpp"
#include "ck/library/utility/device_memory.hpp"
// ProgramServer contains a "meta data buffer"
// host evaluate the expression inside ps(), and push the result into meta data buffer
// ProgramServer send meta data buffer to GPU as kernel arguement
// device read (not evaluate) the value of the expression inside ps() from meta data buffer
struct HelloWorld
{
__host__ __device__ void operator()(ProgramServer& ps, int x, int y, int* res)
{
#if 1
auto r0 = ps(x + y);
auto r1 = ps(x - y);
res[0] = r0;
res[1] = r1;
#elif 1
(void)x;
(void)y;
auto r0 = ps.get_thread_id();
auto r1 = ps.warp_shuffle_up(r0, 1);
auto r2 = ps.warp_shuffle_down(r0, 1);
printf("tid %d, r0 %d, r1 %d, r2 %d\n", ps.get_thread_id(), r0, r1, r2);
res[0] = r0;
res[1] = r2;
#endif
}
};
int main()
{
int x = 100;
int y = 101;
DeviceMem res_dev_buf(2 * sizeof(int));
launch(ProgramServer{},
HelloWorld{},
1,
64,
x,
y,
static_cast<int*>(res_dev_buf.GetDeviceBuffer()));
int res_host[2];
res_dev_buf.FromDevice(&res_host);
printf("x+y=: %d\n", res_host[0]);
printf("x-y=: %d\n", res_host[1]);
return 0;
}
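
The comment at the top of this deleted hello_world example describes the removed mechanism: `launch()` executed the functor once on the CPU so that every `ps(expr)` evaluated `expr` and pushed the result into a byte buffer, shipped that buffer to the GPU as a kernel argument, then executed the same functor on the device, where `ps(expr)` pulled the precomputed bytes instead of evaluating. A host-only sketch of that push/pull round trip, simplified from the `MetaData` struct in the deleted `tile_program.hpp` further below (`MetaDataSketch` is an illustrative name; `memcpy` stands in for its placement-new copy):

```cpp
#include <cassert>
#include <cstring>

template <int kSize>
struct MetaDataSketch
{
    char p_data_[kSize];
    int  size_ = 0; // write cursor (host-side push)
    int  pos_  = 0; // read cursor (device-side pull)

    template <typename T>
    T push(const T& a) // host pass: record the evaluated value
    {
        assert(size_ + int(sizeof(T)) <= kSize);
        std::memcpy(p_data_ + size_, &a, sizeof(T));
        size_ += sizeof(T);
        return a;
    }

    template <typename T>
    T pull() // device pass: replay recorded values in push order
    {
        T a;
        std::memcpy(&a, p_data_ + pos_, sizeof(T));
        pos_ += sizeof(T);
        return a;
    }
};

// Host pass: ps(x + y) computes 201 and records it in the buffer.
// Device pass: the same ps(x + y) returns the recorded 201 without evaluating.
```
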
...
@@ -9,10 +9,9 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/cluster_descriptor.hpp"
 #include "ck/tensor/tensor_view.hpp"
-#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 #include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
-#include "tile_program.hpp"
 #include "ck/tile_program/tile/tile_distribution.hpp"
 #include "ck/tile_program/tile/tile_window.hpp"
 #include "ck/tile_program/tile/load_tile.hpp"
@@ -105,10 +104,8 @@ struct Im2Col
                        Sequence<0, 1>>{});
     }

-    template <typename Server>
     __host__ __device__ void
-    operator()(Server& ps,
-               const std::array<ck::index_t, NDimSpatial + 2>& a_n_wis_c_lengths,
+    operator()(const std::array<ck::index_t, NDimSpatial + 2>& a_n_wis_c_lengths,
                const std::array<ck::index_t, NDimSpatial + 2>& /* a_n_wis_c_strides */,
                const std::array<ck::index_t, NDimSpatial + 2>& b_k_xs_c_lengths,
                const std::array<ck::index_t, NDimSpatial + 2>& /* b_k_xs_c_strides */,
@@ -118,10 +115,8 @@
                const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
                const std::array<ck::index_t, NDimSpatial>& input_left_pads,
                const std::array<ck::index_t, NDimSpatial>& input_right_pads,
-               //
                const std::array<ck::index_t, 2> a_gemmm_gemmk_lengths,
                const std::array<ck::index_t, 2> a_gemmm_gemmk_strides,
-               //
                const T* p_a_img,
                T* p_a_mtx)
     {
@@ -176,8 +171,8 @@ struct Im2Col
         const auto src_gemmm_gemmk =
             transform_tensor_view(a_n_y_ho_x_wo_c,
-                                  make_tuple(ps(make_merge_transform(make_tuple(N, Ho, Wo))),
-                                             ps(make_merge_transform(make_tuple(Y, X, C)))),
+                                  make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)),
+                                             make_merge_transform(make_tuple(Y, X, C))),
                                   make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}),
                                   make_tuple(Sequence<0>{}, Sequence<1>{}));
@@ -191,15 +186,15 @@
         const auto numGemmM = a_gemmm_gemmk_lengths[0];
         const auto numGemmK = a_gemmm_gemmk_lengths[1];

-        const auto id_block = ps.get_block_id();
+        const auto id_block = get_block_id();

-        const auto num_tile_m = ps.read_first_lane(numGemmM / kMPerBlock);
+        const auto num_tile_m = __builtin_amdgcn_readfirstlane(numGemmM / kMPerBlock);

-        const auto block2tile = ps(make_cluster_descriptor(make_tuple(num_tile_m)));
+        const auto block2tile = make_cluster_descriptor(make_tuple(num_tile_m));

         const auto i_gemmm_gemmk = block2tile.CalculateBottomIndex(make_multi_index(id_block));

-        const auto iGemmM = ps.read_first_lane(i_gemmm_gemmk[0]) * kMPerBlock;
+        const auto iGemmM = __builtin_amdgcn_readfirstlane(i_gemmm_gemmk[0]) * kMPerBlock;

         // src window
         auto src_block_window =
@@ -327,26 +322,26 @@ int main()
     ck::index_t kGridSize = (N * Ho * Wo) / kGemmMPerBlock;

-    float ave_time = launch(ProgramServer{},
-                            Im2Col<2, DataType, kBlockSize, kGemmMPerBlock, kGemmKPerBlock>{},
-                            kGridSize,
-                            kBlockSize,
-                            in_lengths,
-                            in_strides,
-                            wei_lengths,
-                            wei_strides,
-                            out_lengths,
-                            out_strides,
-                            filter_strides,
-                            filter_dilations,
-                            input_left_pads,
-                            input_right_pads,
-                            //
-                            in_mtx_lengths,
-                            in_mtx_strides,
-                            //
-                            static_cast<DataType*>(in_buf.GetDeviceBuffer()),
-                            static_cast<DataType*>(in_mtx_buf.GetDeviceBuffer()));
+    float ave_time =
+        launch_kernel(StreamConfig{nullptr, true},
+                      Im2Col<2, DataType, kBlockSize, kGemmMPerBlock, kGemmKPerBlock>{},
+                      kGridSize,
+                      kBlockSize,
+                      0,
+                      in_lengths,
+                      in_strides,
+                      wei_lengths,
+                      wei_strides,
+                      out_lengths,
+                      out_strides,
+                      filter_strides,
+                      filter_dilations,
+                      input_left_pads,
+                      input_right_pads,
+                      in_mtx_lengths,
+                      in_mtx_strides,
+                      static_cast<DataType*>(in_buf.GetDeviceBuffer()),
+                      static_cast<DataType*>(in_mtx_buf.GetDeviceBuffer()));

     std::size_t num_btype = sizeof(DataType) * in_host.GetElementSize() +
                             sizeof(DataType) * in_mtx_host_ref.GetElementSize();
...
@@ -4,9 +4,8 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/cluster_descriptor.hpp"
 #include "ck/tensor/tensor_view.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -39,9 +38,9 @@ void reference_reduce(const Tensor<ADataType>& a_m_n, Tensor<BDataType>& b_m)
 int main(int argc, char* argv[])
 {
-    using ADataType   = float;
+    using ADataType   = ck::half_t;
     using AccDataType = float;
-    using BDataType   = float;
+    using BDataType   = ck::half_t;

     ck::index_t M = 3328;
     ck::index_t N = 4096;
@@ -84,14 +83,15 @@ int main(int argc, char* argv[])
     const auto kernel =
         Reduce<ADataType, AccDataType, BDataType, kBlockSize, kMPerBlock, kNPerBlock>{};

-    float ave_time = launch(ProgramServer{},
-                            kernel,
-                            kGridSize,
-                            kBlockSize,
-                            static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
-                            static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
-                            M,
-                            N);
+    float ave_time = launch_kernel(StreamConfig{nullptr, true},
+                                   kernel,
+                                   kGridSize,
+                                   kBlockSize,
+                                   0,
+                                   static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
+                                   static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
+                                   M,
+                                   N);

     b_buf.FromDevice(b_host_dev.mData.data());
...
@@ -8,7 +8,6 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
-#include "tile_program.hpp"
 #include "ck/tile_program/tile/tile_distribution.hpp"
 #include "ck/tile_program/tile/tile_window.hpp"
 #include "ck/tile_program/tile/load_tile.hpp"
@@ -25,7 +24,7 @@ template <typename ADataType,
 struct Reduce
 {
 #if 0
-    __host__ __device__ static constexpr auto MakeABlockTileDistribution()
+    __device__ static constexpr auto MakeABlockTileDistribution()
     {
         using namespace ck;
         using namespace ck::tile_program;
@@ -40,7 +39,7 @@ struct Reduce
                    Sequence<0, 0, 2, 4>>{});
     }
 #elif 0
-    __host__ __device__ static constexpr auto MakeABlockTileDistribution()
+    __device__ static constexpr auto MakeABlockTileDistribution()
     {
         using namespace ck;
         using namespace ck::tile_program;
@@ -55,7 +54,7 @@ struct Reduce
                    Sequence<0, 0, 2, 4>>{});
     }
 #elif 1
-    __host__ __device__ static constexpr auto MakeABlockTileDistribution()
+    __device__ static constexpr auto MakeABlockTileDistribution()
     {
         using namespace ck;
         using namespace ck::tile_program;
@@ -71,8 +70,8 @@ struct Reduce
     }
 #endif

-    __host__ __device__ void operator()(
-        ProgramServer& ps, const ADataType* p_a, BDataType* p_b, ck::index_t M, ck::index_t N) const
+    __device__ void
+    operator()(const ADataType* p_a, BDataType* p_b, ck::index_t M, ck::index_t N) const
     {
         using namespace ck;
         using namespace ck::tile_program;
@@ -81,7 +80,7 @@ struct Reduce
         const auto a_m_n = make_naive_tensor_view<AddressSpaceEnum::Global>(
             p_a, make_tuple(M, N), make_tuple(N, 1), Number<32>{}, Number<1>{});

-        const auto iM = ps.get_block_id() * kMPerBlock;
+        const auto iM = get_block_id() * kMPerBlock;

         // A window
         auto a_block_window =
...
@@ -4,9 +4,8 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/cluster_descriptor.hpp"
 #include "ck/tensor/tensor_view.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -64,14 +63,15 @@ int main(int argc, char* argv[])
     const auto kernel =
         Softmax<ADataType, AccDataType, BDataType, kBlockSize, kMPerBlock, kNPerBlock>{};

-    float ave_time = launch(ProgramServer{},
-                            kernel,
-                            kGridSize,
-                            kBlockSize,
-                            static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
-                            static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
-                            M,
-                            N);
+    float ave_time = launch_kernel(StreamConfig{nullptr, true},
+                                   kernel,
+                                   kGridSize,
+                                   kBlockSize,
+                                   0,
+                                   static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
+                                   static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
+                                   M,
+                                   N);

     b_buf.FromDevice(b_host_dev.mData.data());
@@ -81,8 +81,5 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;

-    LogRangeAsType<float>(std::cout << "dev: ", b_host_dev.mData, ", ") << std::endl;
-    LogRangeAsType<float>(std::cout << "ref: ", b_host_ref.mData, ", ") << std::endl;

     return !ck::utils::check_err(b_host_dev, b_host_ref);
 }
@@ -8,7 +8,6 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
-#include "tile_program.hpp"
 #include "ck/tile_program/tile/tile_distribution.hpp"
 #include "ck/tile_program/tile/tile_window.hpp"
 #include "ck/tile_program/tile/load_tile.hpp"
@@ -25,7 +24,7 @@ template <typename ADataType,
 struct Softmax
 {
 #if 0
-    __host__ __device__ static constexpr auto MakeABlockTileDistribution()
+    __device__ static constexpr auto MakeABlockTileDistribution()
     {
         using namespace ck;
         using namespace ck::tile_program;
@@ -40,7 +39,7 @@ struct Softmax
                    Sequence<0, 0, 2, 4>>{});
     }
 #elif 0
-    __host__ __device__ static constexpr auto MakeABlockTileDistribution()
+    __device__ static constexpr auto MakeABlockTileDistribution()
     {
         using namespace ck;
         using namespace ck::tile_program;
@@ -55,7 +54,7 @@ struct Softmax
                    Sequence<0, 0, 2, 4>>{});
     }
 #elif 1
-    __host__ __device__ static constexpr auto MakeABlockTileDistribution()
+    __device__ static constexpr auto MakeABlockTileDistribution()
     {
         using namespace ck;
         using namespace ck::tile_program;
@@ -71,8 +70,8 @@ struct Softmax
     }
 #endif

-    __host__ __device__ void operator()(
-        ProgramServer& ps, const ADataType* p_a, BDataType* p_b, ck::index_t M, ck::index_t N) const
+    __device__ void
+    operator()(const ADataType* p_a, BDataType* p_b, ck::index_t M, ck::index_t N) const
     {
         using namespace ck;
         using namespace ck::tile_program;
@@ -84,7 +83,7 @@ struct Softmax
         const auto a_m_n = make_naive_tensor_view<AddressSpaceEnum::Global>(
             p_a, make_tuple(M, N), make_tuple(N, 1), Number<32>{}, Number<1>{});

-        const auto iM = ps.get_block_id() * kMPerBlock;
+        const auto iM = get_block_id() * kMPerBlock;

         // A window
         auto a_block_window =
...
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <hip/hip_runtime.h>
-
-#include "ck/ck.hpp"
-#include "ck/utility/common_header.hpp"
-#include "ck/host_utility/kernel_launch.hpp"
-#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-
-// Meta data for GPU
-// TODO: do we need to take care of data alignment in code, or is that done by the compiler?
-template <ck::index_t kSize>
-struct MetaData
-{
-    char p_data_[kSize];
-
-    ck::index_t size_ = 0;
-    ck::index_t pos_  = 0;
-
-    __host__ __device__ void reset()
-    {
-        size_ = 0;
-        pos_  = 0;
-    }
-
-    __device__ void reset_pos() { pos_ = 0; }
-
-    // push meta data on host
-    // TODO: correct forwarding?
-    template <typename T>
-    __host__ auto push(T&& a)
-    {
-        using Type = ck::remove_cvref_t<T>;
-
-        static_assert(std::is_trivially_copy_constructible_v<Type> &&
-                      std::is_trivially_destructible_v<Type>);
-
-        assert(size_ + sizeof(Type) <= kSize);
-
-        // use placement new to create object copy
-        new(p_data_ + size_) Type(std::forward<T>(a));
-
-        size_ += sizeof(Type);
-
-        return ck::forwarder{}(a);
-    }
-
-    // pull meta data on device
-    // TODO: correct forwarding?
-    template <typename T>
-    __device__ auto pull()
-    {
-        using Type = ck::remove_cvref_t<T>;
-
-        static_assert(std::is_trivially_copy_constructible_v<Type> &&
-                      std::is_trivially_destructible_v<Type>);
-
-        Type a(*reinterpret_cast<Type*>(p_data_ + pos_));
-
-        pos_ += sizeof(Type);
-
-        return a;
-    }
-};
-
-// namespace tp (for tile programming)
-struct ProgramServer
-{
-    // meta data, shipped to the device as part of the kernel argument
-    MetaData<1024> meta_data_;
-
-    __host__ void cpu_init() { meta_data_.reset(); }
-
-    __device__ void gpu_init() { meta_data_.reset_pos(); }
-
-    // push meta data on host
-    template <typename T>
-    __host__ auto operator()(T&& a)
-    {
-        return ck::forwarder{}(meta_data_.push(a));
-    }
-
-    // pull meta data on device
-    template <typename T>
-    __device__ auto operator()(T&&)
-    {
-        return ck::forwarder{}(meta_data_.pull<T>());
-    }
-
-    // host-side stubs (the host pass only records meta data)
-    __host__ static ck::index_t get_block_id() { return -1; }
-    __host__ static ck::index_t get_thread_id() { return -1; }
-    __host__ static ck::index_t get_grid_size() { return -1; }
-    __host__ static void block_sync_lds() {}
-
-    // TODO: correct forwarding?
-    template <typename T>
-    __host__ static constexpr auto read_first_lane(T&& a)
-    {
-        return ck::forwarder{}(a);
-    }
-
-    template <typename T>
-    __host__ T warp_shuffle_up(T, uint32_t)
-    {
-        return 0;
-    }
-
-    template <typename T>
-    __host__ T warp_shuffle_down(T, uint32_t)
-    {
-        return 0;
-    }
-
-    // device-side implementations
-    __device__ static ck::index_t get_block_id() { return ck::get_block_id(); }
-    __device__ static ck::index_t get_thread_id() { return ck::get_thread_id(); }
-    __device__ static ck::index_t get_grid_size() { return ck::get_grid_size(); }
-    __device__ static void block_sync_lds() { ck::block_sync_lds(); }
-
-    template <typename T>
-    __device__ static constexpr auto read_first_lane(T&& a)
-    {
-        return __builtin_amdgcn_readfirstlane(a);
-    }
-
-    template <typename T>
-    __device__ T warp_shuffle_up(const T& var, uint32_t delta)
-    {
-        return ck::warp_shuffle_up(var, delta);
-    }
-
-    template <typename T>
-    __device__ T warp_shuffle_down(const T& var, uint32_t delta)
-    {
-        return ck::warp_shuffle_down(var, delta);
-    }
-};
-
-template <typename Server, typename Program, typename... Xs>
-#if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
-#endif
-    __global__ void gpu_program_wrapper(Server server, Program f, Xs... xs)
-{
-    server.gpu_init();
-
-    f(server, xs...);
-}
-
-template <typename Server, typename Program, typename... Xs>
-float launch(Server server, Program f, dim3 grid_dim, dim3 block_dim, Xs... xs)
-{
-    server.cpu_init();
-
-    f(server, xs...);
-
-    printf("meta data size %d\n", server.meta_data_.size_);
-
-    printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d}\n",
-           __func__,
-           grid_dim.x,
-           grid_dim.y,
-           grid_dim.z,
-           block_dim.x,
-           block_dim.y,
-           block_dim.z);
-
-#if 0
-    gpu_program_wrapper<Server, Program><<<grid_dim, block_dim, 0, nullptr>>>(server, f, xs...);
-#else
-    return launch_and_time_kernel(StreamConfig{nullptr, true, 0},
-                                  gpu_program_wrapper<Server, Program, Xs...>,
-                                  grid_dim,
-                                  block_dim,
-                                  0,
-                                  server,
-                                  f,
-                                  xs...);
-#endif
-}
...
@@ -9,6 +9,15 @@
 #include "ck/stream_config.hpp"
 #include "ck/host_utility/hip_check_error.hpp"

+template <int MaxThreadPerBlock, int MinBlockPerCu, typename Kernel, typename... Args>
+#if CK_USE_LAUNCH_BOUNDS
+__launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
+#endif
+    __global__ void kernel_wrapper(Kernel f, Args... args)
+{
+    f(args...);
+}
+
 template <typename... Args, typename F>
 float launch_and_time_kernel(const StreamConfig& stream_config,
                              F kernel,
@@ -142,3 +151,20 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
     return 0;
 #endif
 }
+
+template <int MaxThreadPerBlock = CK_MAX_THREAD_PER_BLOCK,
+          int MinBlockPerCu     = CK_MIN_BLOCK_PER_CU,
+          typename KernelImpl,
+          typename... Args>
+float launch_kernel(const StreamConfig& stream_config,
+                    KernelImpl kernel_impl,
+                    dim3 grid_dim,
+                    dim3 block_dim,
+                    std::size_t lds_byte,
+                    Args... args)
+{
+    const auto kernel = kernel_wrapper<MaxThreadPerBlock, MinBlockPerCu, KernelImpl, Args...>;
+
+    return launch_and_time_kernel(
+        stream_config, kernel, grid_dim, block_dim, lds_byte, kernel_impl, args...);
+}
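
For reference, a call site shaped like the ones in the examples above (a sketch assembled from the reduce example in this commit, not new API surface): the argument order is stream config, functor, grid, block, dynamic-LDS bytes, then the functor's own arguments. The two leading template arguments override the default `CK_MAX_THREAD_PER_BLOCK` / `CK_MIN_BLOCK_PER_CU` bounds, which is how this commit specifies launch bounds per kernel instance:

```cpp
// Pin launch bounds for this instance: at most kBlockSize threads per block,
// at least 2 blocks resident per CU.
float ave_time = launch_kernel<kBlockSize, 2>(StreamConfig{nullptr, true},
                                              kernel,     // a __device__ functor
                                              kGridSize,
                                              kBlockSize,
                                              0,          // dynamic LDS bytes
                                              p_a, p_b, M, N);
```
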
...
@@ -340,63 +340,6 @@ struct BlockGemmARegBSmemCRegV1
         return c_block_tensor;
     }
-
-    // FIXME: remove: dummy host function for tile programming
-    template <typename CBlockTensor, typename ABlockTensorTmp, typename BBlockWindowTmp>
-    __host__ void operator()(CBlockTensor&, const ABlockTensorTmp&, const BBlockWindowTmp&) const
-    {
-    }
-
-    // FIXME: remove: dummy host function for tile programming
-    template <typename ABlockTensorTmp, typename BBlockWindowTmp>
-    __host__ auto operator()(const ABlockTensorTmp&, const BBlockWindowTmp&) const
-    {
-        static_assert(is_same_v<ADataType, remove_cv_t<typename ABlockTensorTmp::DataType>> &&
-                          is_same_v<BDataType, remove_cv_t<typename BBlockWindowTmp::DataType>>,
-                      "wrong!");
-
-        constexpr index_t MPerBlock = ABlockTensorTmp{}.GetLengths()[Number<0>{}];
-        constexpr index_t NPerBlock = BBlockWindowTmp{}.GetWindowLengths()[Number<0>{}];
-
-        static_assert(MPerBlock == BlockGemmShape::kM && NPerBlock == BlockGemmShape::kN, "wrong!");
-
-        constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
-
-        using WG = remove_cvref_t<decltype(config.template At<0>())>;
-
-        constexpr index_t MWarp = config.template At<1>();
-        constexpr index_t NWarp = config.template At<2>();
-
-        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
-        constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
-
-        constexpr auto c_block_outer_dstr_encoding = StaticTileDistributionEncoding<
-            Sequence<>,
-            Tuple<Sequence<MIterPerWarp, MWarp>, Sequence<NIterPerWarp, NWarp>>,
-            Tuple<Sequence<1, 2>>,
-            Tuple<Sequence<1, 1>>,
-            Sequence<1, 2>,
-            Sequence<0, 0>>{};
-
-        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
-
-        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
-
-#if 0
-        // FIXME: need method to check a_block_tensor and a_block_tensor_tmp have equivalent distribution
-        static_assert(
-            is_same_v<remove_cvref_t<decltype(a_block_dstr_encode)>,
-                      remove_cvref_t<decltype(
-                          ABlockTensorTmp::GetBlockDistribution().GetStaticTensorDistributionEncoding())>>,
-            "wrong!");
-#endif
-
-        // Construct C-Block-Tensor
-        auto c_block_tensor = make_static_distributed_tensor<CDataType>(c_block_dstr);
-
-        return c_block_tensor;
-    }
 };

 } // namespace block
...
@@ -331,55 +331,6 @@ struct BlockGemmASmemBSmemCRegV1
         return c_block_tensor;
     }
-
-    // FIXME: remove: dummy host function for tile programming
-    template <typename CBlockTensor, typename ABlockWindowTmp, typename BBlockWindowTmp>
-    __host__ void operator()(CBlockTensor&, const ABlockWindowTmp&, const BBlockWindowTmp&) const
-    {
-    }
-
-    // FIXME: remove: dummy host function for tile programming
-    template <typename ABlockWindowTmp, typename BBlockWindowTmp>
-    __host__ auto operator()(const ABlockWindowTmp&, const BBlockWindowTmp&) const
-    {
-        static_assert(is_same_v<ADataType, typename ABlockWindowTmp::DataType> &&
-                          is_same_v<BDataType, typename BBlockWindowTmp::DataType>,
-                      "wrong!");
-
-        constexpr index_t MPerBlock = ABlockWindowTmp{}.GetWindowLengths()[Number<0>{}];
-        constexpr index_t NPerBlock = BBlockWindowTmp{}.GetWindowLengths()[Number<0>{}];
-
-        static_assert(MPerBlock == BlockGemmShape::kM && NPerBlock == BlockGemmShape::kN, "wrong!");
-
-        constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
-
-        using WG = remove_cvref_t<decltype(config.template At<0>())>;
-
-        constexpr index_t MWarp = config.template At<1>();
-        constexpr index_t NWarp = config.template At<2>();
-
-        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
-        constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
-
-        constexpr auto c_block_outer_dstr_encoding = StaticTileDistributionEncoding<
-            Sequence<>,
-            Tuple<Sequence<MIterPerWarp, MWarp>, Sequence<NIterPerWarp, NWarp>>,
-            Tuple<Sequence<1, 2>>,
-            Tuple<Sequence<1, 1>>,
-            Sequence<1, 2>,
-            Sequence<0, 0>>{};
-
-        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
-
-        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
-
-        static_assert(is_same_v<CDataType, typename WG::CDataType>, "wrong!");
-
-        auto c_block_tensor = make_static_distributed_tensor<CDataType>(c_block_dstr);
-
-        return c_block_tensor;
-    }
 };

 } // namespace block
...
@@ -210,52 +210,6 @@ __device__ auto block_tile_reduce(const InDistributedTensor_& in_tensor,
     return acc_tensor;
 }
-
-// FIXME: dummy host function for tile program
-template <typename AccDistributedTensor_,
-          typename InDistributedTensor_,
-          index_t... InReduceDims,
-          typename ReduceFunc>
-__host__ void block_tile_reduce(AccDistributedTensor_&,
-                                const InDistributedTensor_&,
-                                Sequence<InReduceDims...>,
-                                const ReduceFunc&)
-{
-}
-
-// FIXME: dummy host function for tile program
-template <typename AccDataType_,
-          typename InDistributedTensor_,
-          index_t... InReduceDims,
-          typename ReduceFunc,
-          typename InDataType_>
-__host__ auto block_tile_reduce(const InDistributedTensor_&,
-                                Sequence<InReduceDims...>,
-                                const ReduceFunc&,
-                                const InDataType_&)
-{
-    using InDataType  = typename InDistributedTensor_::DataType;
-    using AccDataType = remove_cvref_t<AccDataType_>;
-
-    static_assert(is_same_v<InDataType, remove_cvref_t<InDataType_>>, "wrong!");
-
-    // declare acc_tensor
-    constexpr auto acc_dstr = make_static_tile_distribution(
-        ck::tile_program::detail::make_reduce_tile_distribution_encoding(
-            InDistributedTensor_::GetTileDistribution().GetStaticTileDistributionEncoding(),
-            Sequence<InReduceDims...>{}));
-
-    auto acc_tensor = make_static_distributed_tensor<AccDataType>(acc_dstr);
-
-    return acc_tensor;
-}
-
-// FIXME: dummy host function for tile program
-template <typename AccDistributedTensor_, typename ReduceFunc>
-__host__ void block_tile_reduce_sync(AccDistributedTensor_&, const ReduceFunc&)
-{
-}

 } // namespace block
 } // namespace tile_program
 } // namespace ck
...
@@ -159,12 +159,12 @@ struct BlockGemmPipelineAGmemBGmemCRegV1
             a_block_tile = load_tile(a_copy_dram_window);
             b_block_tile = load_tile(b_copy_dram_window);

-            ProgramServer::block_sync_lds();
+            block_sync_lds();

             // GEMM i
             block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);

-            ProgramServer::block_sync_lds();
+            block_sync_lds();

             // move to i + 2
             move_tile_window(a_copy_dram_window, {0, kKPerBlock});
@@ -184,7 +184,7 @@ struct BlockGemmPipelineAGmemBGmemCRegV1
         // tail
         {
-            ProgramServer::block_sync_lds();
+            block_sync_lds();

             // GEMM num_loop - 1
             block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
@@ -194,10 +194,10 @@ struct BlockGemmPipelineAGmemBGmemCRegV1
     }

     template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
-    __host__ __device__ auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                        const BDramBlockWindowTmp& b_dram_block_window_tmp,
-                                        index_t num_loop,
-                                        void* p_smem) const
+    __device__ auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                               const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                               index_t num_loop,
+                               void* p_smem) const
     {
         return operator()(
             a_dram_block_window_tmp,
...
@@ -161,12 +161,12 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
         do
         {
-            ProgramServer::block_sync_lds();
+            block_sync_lds();

             // GEMM i
             block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);

-            ProgramServer::block_sync_lds();
+            block_sync_lds();

             // move to i + 2
             move_tile_window(a_copy_dram_window, {0, kKPerBlock});
@@ -190,12 +190,12 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
         // tail
        {
-            ProgramServer::block_sync_lds();
+            block_sync_lds();

             // GEMM num_loop - 2
             block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);

-            ProgramServer::block_sync_lds();
+            block_sync_lds();

             // LDS write num_loop - 1
             const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
@@ -204,7 +204,7 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
             const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_block_tile);
             store_tile(b_copy_lds_window, b_block_tile_tmp);

-            ProgramServer::block_sync_lds();
+            block_sync_lds();

             // GEMM num_loop - 1
             block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
@@ -214,10 +214,10 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
     }

     template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
-    __host__ __device__ auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                        const BDramBlockWindowTmp& b_dram_block_window_tmp,
-                                        index_t num_loop,
-                                        void* p_smem) const
+    __device__ auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                               const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                               index_t num_loop,
+                               void* p_smem) const
     {
         return operator()(
             a_dram_block_window_tmp,
...