"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "4590418eb2b864d62a30e05037eb4a970650c939"
Unverified Commit 0e92deb7 authored by Chao Liu, committed by GitHub

Tile program init bulk PR (#4)



Tile Program init bulk PR

---------
Co-authored-by: zjing14 <zhangjing14@gmail.com>
Co-authored-by: Po-Yen, Chen <PoYen.Chen@amd.com>
parent 0077eeb3
@@ -93,6 +93,7 @@ else()
             -Wno-unused-command-line-argument
             -Wno-weak-vtables
             -Wno-covered-switch-default
+            -Wno-unused-lambda-capture
             -Wno-unsafe-buffer-usage
         )
 else()
......
@@ -39,7 +39,9 @@ using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffl
 // ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
 // ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-        < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
+        < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8,
+          ck::make_default_loop_scheduler(),
+          ck::PipelineVersion::v2>;
 // clang-format on
 using DeviceGemmInstance = DeviceGemmInstance1;
......
@@ -21,6 +21,7 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/fill.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
......
@@ -139,9 +139,9 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[])
     {
     case 0: break;
     case 1:
-        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
-        b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-5, 5});
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-3.f, 3.f}(a_g_m_k);
+        ck::utils::FillUniformDistributionIntegerValue<B0DataType>{-3.f, 3.f}(b0_g_k_n);
+        ck::utils::FillUniformDistributionIntegerValue<B1DataType>{-3.f, 3.f}(b1_g_n_o);
         break;
     case 2:
         a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
......
add_example_executable(example_hello_world hello_world.cpp)
add_example_executable(example_im2col im2col.cpp)
add_example_executable(example_gemm gemm.cpp)
add_example_executable(example_gemm_gemm gemm_gemm.cpp)
add_example_executable(example_reduce reduce.cpp)
#include <cstring>
#include <iostream>
#include <string>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "reference_gemm.hpp"
#include "gemm.hpp"
// elementwise lambda
struct AElementFunction
{
template <typename X>
__host__ __device__ auto operator()(const X& x) const
{
return x;
}
};
struct BElementFunction
{
template <typename X>
__host__ __device__ auto operator()(const X& x) const
{
return x;
}
};
struct CElementFunction
{
template <typename X>
__host__ __device__ auto operator()(const X& x) const
{
return x;
}
};
int main(int argc, char* argv[])
{
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using AccDataType = float;
using CDataType = ck::half_t;
ck::index_t M = 3328;
ck::index_t N = 4096;
ck::index_t K = 4096;
if(argc == 4)
{
M = std::stoi(argv[1]);
N = std::stoi(argv[2]);
K = std::stoi(argv[3]);
}
std::array<ck::index_t, 2> a_lengths{M, K};
std::array<ck::index_t, 2> a_strides{K, 1};
std::array<ck::index_t, 2> b_lengths{N, K};
std::array<ck::index_t, 2> b_strides{K, 1};
std::array<ck::index_t, 2> c_lengths{M, N};
std::array<ck::index_t, 2> c_strides{N, 1};
// host verify
Tensor<ADataType> a_host(a_lengths, a_strides);
Tensor<BDataType> b_host(b_lengths, b_strides);
Tensor<CDataType> c_host_ref(c_lengths, c_strides);
Tensor<CDataType> c_host_dev(c_lengths, c_strides);
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_host);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_host);
// reference gemm
reference_gemm<ADataType, ADataType, CDataType, float>(a_host, b_host, c_host_ref);
DeviceMem a_buf(sizeof(ADataType) * a_host.GetElementSpaceSize());
DeviceMem b_buf(sizeof(BDataType) * b_host.GetElementSpaceSize());
DeviceMem c_buf(sizeof(CDataType) * c_host_dev.GetElementSpaceSize());
a_buf.ToDevice(a_host.mData.data());
b_buf.ToDevice(b_host.mData.data());
constexpr ck::index_t kGemmMPerBlock = 256;
constexpr ck::index_t kGemmNPerBlock = 128;
constexpr ck::index_t kGemmKPerBlock = 32;
constexpr ck::index_t kBlockSize = 256;
ck::index_t kGridSize = (M / kGemmMPerBlock) * (N / kGemmNPerBlock);
std::cout << "grid size " << kGridSize << std::endl;
const auto gemm_kernel = Gemm<ADataType,
BDataType,
AccDataType,
CDataType,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
AElementFunction,
BElementFunction,
CElementFunction,
kBlockSize,
kGemmMPerBlock,
kGemmNPerBlock,
kGemmKPerBlock>{};
float ave_time = launch(ProgramServer{},
gemm_kernel,
kGridSize,
kBlockSize,
static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_buf.GetDeviceBuffer()),
M,
N,
K,
K,
K,
N,
AElementFunction{},
BElementFunction{},
CElementFunction{});
c_buf.FromDevice(c_host_dev.mData.data());
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
return !ck::utils::check_err(c_host_dev, c_host_ref);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_elementwise.hpp"
#include "ck/tile_program/tile/tile_gemm_shape.hpp"
#include "ck/tile_program/warp_tile/warp_gemm.hpp"
#include "ck/tile_program/block_tile_pipeline/block_gemm_pipeline_agmem_bgmem_creg_v1.hpp"
#include "ck/tile_program/block_tile_pipeline/block_gemm_pipeline_agmem_bgmem_creg_v2.hpp"
#include "ck/tile_program/grid/grid_gemm.hpp"
#include "ck/tile_program/grid/grid_gemm_policy.hpp"
#include "ck/tile_program/grid/grid_gemm_problem.hpp"
// C = A * B
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename ALayout,
typename BLayout,
typename CLayout,
typename AElementFunction,
typename BElementFunction,
typename CElementFunction,
ck::index_t kBlockSize,
ck::index_t kMPerBlock,
ck::index_t kNPerBlock,
ck::index_t kKPerBlock>
struct Gemm
{
static_assert(std::is_same_v<ALayout, ck::tensor_layout::gemm::RowMajor> &&
std::is_same_v<BLayout, ck::tensor_layout::gemm::ColumnMajor> &&
std::is_same_v<CLayout, ck::tensor_layout::gemm::RowMajor>);
using Problem = ck::tile_program::grid::GridGemmProblem<ADataType,
BDataType,
AccDataType,
CDataType,
AElementFunction,
BElementFunction,
CElementFunction>;
using Policy = ck::tile_program::grid::GridGemmPolicy<
kBlockSize,
kMPerBlock,
kNPerBlock,
kKPerBlock,
ck::tile_program::block::BlockGemmPipelineAGmemBGmemCRegV2,
ck::Tuple<ck::tile_program::grid::DefaultBlock2TileMap,
ck::tile_program::block::BlockGemmPipelineAGmemBGmemCRegV2DefaultPolicy>>;
using GridGemm = ck::tile_program::grid::GridGemm<Problem, Policy>;
__host__ __device__ void operator()(ProgramServer& ps,
const ADataType* p_a,
const BDataType* p_b,
CDataType* p_c,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t Lda,
ck::index_t Ldb,
ck::index_t Ldc,
const AElementFunction& a_element_func,
const BElementFunction& b_element_func,
const CElementFunction& c_element_func) const
{
using namespace ck;
using namespace ck::tile_program;
using namespace ck::tile_program::block;
// FIXME: assume RCR layout
const auto a_dram_grid = make_naive_tensor_view<AddressSpaceEnum::Global>(
p_a, make_tuple(M, K), make_tuple(Lda, 1), Number<32>{}, Number<1>{});
const auto b_dram_grid = make_naive_tensor_view<AddressSpaceEnum::Global>(
p_b, make_tuple(N, K), make_tuple(Ldb, 1), Number<32>{}, Number<1>{});
auto c_dram_grid = make_naive_tensor_view<AddressSpaceEnum::Global>(
p_c, make_tuple(M, N), make_tuple(Ldc, 1), Number<32>{}, Number<1>{});
GridGemm{}(ps,
a_dram_grid,
b_dram_grid,
c_dram_grid,
a_element_func,
b_element_func,
c_element_func);
}
};
#include <cstring>
#include <iostream>
#include <string>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "reference_gemm.hpp"
#include "gemm_gemm.hpp"
int main(int argc, char* argv[])
{
using A0DataType = ck::half_t;
using B0DataType = ck::half_t;
using Acc0DataType = float;
using C0DataType = ck::half_t;
using B1DataType = ck::half_t;
using Acc1DataType = float;
using C1DataType = ck::half_t;
ck::index_t M0 = 13312;
ck::index_t N0 = 4096;
ck::index_t K0 = 128;
ck::index_t N1 = 128;
if(argc == 5)
{
M0 = std::stoi(argv[1]);
N0 = std::stoi(argv[2]);
K0 = std::stoi(argv[3]);
N1 = std::stoi(argv[4]);
}
std::array<ck::index_t, 2> a0_lengths{M0, K0};
std::array<ck::index_t, 2> a0_strides{K0, 1};
std::array<ck::index_t, 2> b0_lengths{N0, K0};
std::array<ck::index_t, 2> b0_strides{K0, 1};
std::array<ck::index_t, 2> c0_lengths{M0, N0};
std::array<ck::index_t, 2> c0_strides{N0, 1};
std::array<ck::index_t, 2> b1_lengths{N1, N0};
std::array<ck::index_t, 2> b1_strides{N0, 1};
std::array<ck::index_t, 2> c1_lengths{M0, N1};
std::array<ck::index_t, 2> c1_strides{N1, 1};
// host verify
Tensor<A0DataType> a0_host(a0_lengths, a0_strides);
Tensor<B0DataType> b0_host(b0_lengths, b0_strides);
Tensor<B1DataType> b1_host(b1_lengths, b1_strides);
Tensor<C0DataType> c0_host_ref(c0_lengths, c0_strides);
Tensor<C1DataType> c1_host_ref(c1_lengths, c1_strides);
Tensor<C1DataType> c1_host_dev(c1_lengths, c1_strides);
ck::utils::FillUniformDistributionIntegerValue<A0DataType>{-3.f, 3.f}(a0_host);
ck::utils::FillUniformDistributionIntegerValue<B0DataType>{-3.f, 3.f}(b0_host);
ck::utils::FillUniformDistributionIntegerValue<B1DataType>{-3.f, 3.f}(b1_host);
// reference gemm
reference_gemm<A0DataType, B0DataType, C0DataType, float>(a0_host, b0_host, c0_host_ref);
reference_gemm<C0DataType, B1DataType, C1DataType, float>(c0_host_ref, b1_host, c1_host_ref);
DeviceMem a0_buf(sizeof(A0DataType) * a0_host.GetElementSpaceSize());
DeviceMem b0_buf(sizeof(B0DataType) * b0_host.GetElementSpaceSize());
DeviceMem b1_buf(sizeof(B1DataType) * b1_host.GetElementSpaceSize());
DeviceMem c1_buf(sizeof(C1DataType) * c1_host_ref.GetElementSpaceSize());
a0_buf.ToDevice(a0_host.mData.data());
b0_buf.ToDevice(b0_host.mData.data());
b1_buf.ToDevice(b1_host.mData.data());
constexpr ck::index_t kM0PerBlock = 128;
constexpr ck::index_t kN0PerBlock = 128;
constexpr ck::index_t kK0PerBlock = 32;
constexpr ck::index_t kN1PerBlock = 128;
constexpr ck::index_t kBlockSize = 256;
ck::index_t kGridSize = (M0 / kM0PerBlock) * (N1 / kN1PerBlock);
std::cout << "grid size " << kGridSize << std::endl;
float ave_time = launch(ProgramServer{},
GemmGemm<A0DataType,
B0DataType,
Acc0DataType,
C0DataType,
B1DataType,
Acc1DataType,
C1DataType,
kBlockSize,
kM0PerBlock,
kN0PerBlock,
kK0PerBlock,
kN1PerBlock>{},
kGridSize,
kBlockSize,
static_cast<A0DataType*>(a0_buf.GetDeviceBuffer()),
static_cast<B0DataType*>(b0_buf.GetDeviceBuffer()),
static_cast<B1DataType*>(b1_buf.GetDeviceBuffer()),
static_cast<C1DataType*>(c1_buf.GetDeviceBuffer()),
M0,
N0,
K0,
N1,
K0, // Lda0
K0, // Ldb0
N0, // Ldb1
N1); // Ldc1
c1_buf.FromDevice(c1_host_dev.mData.data());
std::size_t flop = std::size_t(2) * M0 * N0 * K0 + std::size_t(2) * M0 * N1 * N0;
std::size_t num_btype = sizeof(A0DataType) * M0 * K0 + sizeof(B0DataType) * N0 * K0 +
sizeof(B1DataType) * N1 * N0 + sizeof(C1DataType) * M0 * N1;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
return !ck::utils::check_err(c1_host_dev, c1_host_ref);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_elementwise.hpp"
#include "ck/tile_program/tile/tile_gemm_shape.hpp"
#include "ck/tile_program/warp_tile/warp_gemm.hpp"
#include "ck/tile_program/block_tile_pipeline/block_gemm_pipeline_agmem_bgmem_creg_v2.hpp"
#include "ck/tile_program/block_tile_pipeline/block_gemm_pipeline_problem.hpp"
#include "ck/tile_program/block_tile/block_gemm_areg_bsmem_creg_v1.hpp"
// C1 = A0 * B0 * B1
template <typename A0DataType,
typename B0DataType,
typename Acc0DataType,
typename C0DataType,
typename B1DataType,
typename Acc1DataType,
typename C1DataType,
ck::index_t kBlockSize,
ck::index_t kM0PerBlock,
ck::index_t kN0PerBlock,
ck::index_t kK0PerBlock,
ck::index_t kN1PerBlock>
struct GemmGemm
{
// block gemm0 pipeline
using BlockGemm0Pipeline = ck::tile_program::block::BlockGemmPipelineAGmemBGmemCRegV2<
ck::tile_program::block::BlockGemmPipelineProblem<
A0DataType,
B0DataType,
Acc0DataType,
kBlockSize,
ck::tile_program::TileGemmShape<kM0PerBlock, kN0PerBlock, kK0PerBlock>>,
ck::tile_program::block::BlockGemmPipelineAGmemBGmemCRegV2DefaultPolicy>;
// block gemm1
using BlockGemm1 = ck::tile_program::block::BlockGemmARegBSmemCRegV1<
ck::tile_program::block::BlockGemmARegBSmemCRegV1Problem<
C0DataType,
B1DataType,
Acc1DataType,
kBlockSize,
ck::tile_program::TileGemmShape<kM0PerBlock, kN1PerBlock, kN0PerBlock>>,
ck::tile_program::block::BlockGemmARegBSmemCRegV1DefaultPolicy>;
#if 0
// 2d
__host__ __device__ static constexpr auto MakeB1LdsBlockDescriptor()
{
using namespace ck;
constexpr index_t kNPerBlock = kN1PerBlock;
constexpr index_t kKPerBlock = kN0PerBlock;
constexpr auto b_lds_block_desc =
make_naive_tensor_descriptor_packed(make_tuple(kNPerBlock, kKPerBlock), Number<32>{});
return b_lds_block_desc;
}
#else
// fake XOR
__host__ __device__ static constexpr auto MakeB1LdsBlockDescriptor()
{
using namespace ck;
using BDataType = B1DataType;
constexpr index_t kNPerBlock = kN1PerBlock;
constexpr index_t kKPerBlock = kN0PerBlock;
constexpr auto b_lds_block_desc_d1_d2_d3 = make_naive_tensor_descriptor_packed(
make_tuple(kNPerBlock / 2, 2, kKPerBlock), Number<kKPerBlock>{});
constexpr index_t kK1 = 16 / sizeof(BDataType);
constexpr auto b_lds_block_desc_d4_d5_d6 = transform_tensor_descriptor(
b_lds_block_desc_d1_d2_d3,
make_tuple(make_xor_transform(make_tuple(kNPerBlock / 2, kKPerBlock), kK1),
make_pass_through_transform(2)),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
constexpr auto b_lds_block_desc_n_k = transform_tensor_descriptor(
b_lds_block_desc_d4_d5_d6,
make_tuple(make_merge_transform(make_tuple(kNPerBlock / 2, 2)),
make_pass_through_transform(kKPerBlock)),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return b_lds_block_desc_n_k;
}
#endif
__host__ __device__ static constexpr auto MakeB1DramTileDistribution()
{
using namespace ck;
using namespace ck::tile_program;
using BDataType = B1DataType;
constexpr index_t kNPerBlock = kN1PerBlock;
constexpr index_t kKPerBlock = kN0PerBlock;
constexpr index_t K1 = 16 / sizeof(BDataType);
constexpr index_t K0 = kKPerBlock / K1;
constexpr index_t N2 = get_warp_size() / K0;
constexpr index_t N1 = kBlockSize / get_warp_size();
constexpr index_t N0 = kNPerBlock / (N2 * N1);
return make_static_tile_distribution(
StaticTileDistributionEncoding<Sequence<1>,
Tuple<Sequence<N0, N1, N2>, Sequence<K0, K1>>,
Tuple<Sequence<1>, Sequence<1, 2>>,
Tuple<Sequence<1>, Sequence<2, 0>>,
Sequence<1, 2>,
Sequence<0, 1>>{});
}
__host__ __device__ static constexpr ck::index_t GetStaticLdsSize()
{
using namespace ck;
return math::max(BlockGemm0Pipeline::GetStaticLdsSize(),
static_cast<index_t>(MakeB1LdsBlockDescriptor().GetElementSpaceSize() *
sizeof(B1DataType)));
}
__host__ __device__ void operator()(ProgramServer& ps,
const A0DataType* p_a0,
const B0DataType* p_b0,
const B1DataType* p_b1,
C1DataType* p_c1,
ck::index_t M0,
ck::index_t N0,
ck::index_t K0,
ck::index_t N1,
ck::index_t Lda0,
ck::index_t Ldb0,
ck::index_t Ldb1,
ck::index_t Ldc1)
{
using namespace ck;
using namespace ck::tile_program;
using namespace ck::tile_program::block;
// FIXME: assume layout A0[M0, K0], B0[N0, K0], B1[N1, N0], C1[M0, N1]
const auto a0_dram_grid = make_naive_tensor_view<AddressSpaceEnum::Global>(
p_a0, make_tuple(M0, K0), make_tuple(Lda0, 1), Number<32>{}, Number<1>{});
const auto b0_dram_grid = make_naive_tensor_view<AddressSpaceEnum::Global>(
p_b0, make_tuple(N0, K0), make_tuple(Ldb0, 1), Number<32>{}, Number<1>{});
const auto b1_dram_grid = make_naive_tensor_view<AddressSpaceEnum::Global>(
p_b1, make_tuple(N1, N0), make_tuple(Ldb1, 1), Number<32>{}, Number<1>{});
// divide problem
const auto id_block = ps.get_block_id();
const auto num_tile_m0 = M0 / kM0PerBlock;
const auto num_tile_n1 = N1 / kN1PerBlock;
const auto block2tile = ps(make_cluster_descriptor(make_tuple(num_tile_m0, num_tile_n1)));
const auto id_tile = block2tile.CalculateBottomIndex(make_tuple(id_block));
const auto iM0 = ps.read_first_lane(id_tile.At<0>() * kM0PerBlock);
const auto iN1 = ps.read_first_lane(id_tile.At<1>() * kN1PerBlock);
__shared__ char p_smem_char[GetStaticLdsSize()];
// A0 DRAM block window
auto a0_dram_block_window = make_tile_window(
a0_dram_grid, make_tuple(Number<kM0PerBlock>{}, Number<kK0PerBlock>{}), {iM0, 0});
// B0 DRAM block window
auto b0_dram_block_window = make_tile_window(
b0_dram_grid, make_tuple(Number<kN0PerBlock>{}, Number<kK0PerBlock>{}), {0, 0});
// Block GEMM0 pipeline
constexpr auto block_gemm0_pipeline = BlockGemm0Pipeline{};
// B1 DRAM window
auto b1_dram_block_window =
make_tile_window(b1_dram_grid,
make_tuple(Number<kN1PerBlock>{}, Number<kN0PerBlock>{}),
{iN1, 0},
MakeB1DramTileDistribution());
// B1 LDS tensor view: occupies the same LDS allocation as block_gemm0_pipeline
auto b1_lds_block = make_tensor_view<AddressSpaceEnum::Lds>(
reinterpret_cast<B1DataType*>(p_smem_char), MakeB1LdsBlockDescriptor());
auto b1_lds_block_window = make_tile_window(
b1_lds_block, make_tuple(Number<kN1PerBlock>{}, Number<kN0PerBlock>{}), {0, 0});
// Block GEMM1
constexpr auto block_gemm1 = BlockGemm1{};
// Acc1 tile
auto acc1_block_tile = decltype(block_gemm1(
tile_elementwise_in(
type_convert<C0DataType, Acc0DataType>,
block_gemm0_pipeline(a0_dram_block_window, b0_dram_block_window, 0, nullptr)),
b1_dram_block_window)){};
// init Acc1
tile_elementwise_inout([](auto& acc1) { acc1 = 0; }, acc1_block_tile);
#if 0
index_t iN0 = 0;
do
{
// Block GEMM0 pipeline: acc0 = a0 * b0
const auto acc0_block_tile = block_gemm0_pipeline(
a0_dram_block_window, b0_dram_block_window, K0 / kK0PerBlock, p_smem_char);
// type cast acc0 into c0
const auto c0_block_tile =
tile_elementwise_in(type_convert<C0DataType, Acc0DataType>, acc0_block_tile);
// Block GEMM1: acc1 += c0 * b1
{
// load b1
const auto b1_block_tile = load_tile(b1_dram_block_window);
// wait for block gemm0 pipeline to finish
ps.block_sync_lds();
store_tile(b1_lds_block_window, b1_block_tile);
// wait for store_tile to finish
ps.block_sync_lds();
// acc1 += c0 * b1
block_gemm1(acc1_block_tile, c0_block_tile, b1_lds_block_window);
// wait for block gemm1 to finish
ps.block_sync_lds();
}
// move tile windows
move_tile_window(b0_dram_block_window, {kN0PerBlock, 0});
move_tile_window(b1_dram_block_window, {0, kN0PerBlock});
iN0 += kN0PerBlock;
} while(iN0 < N0);
#else
index_t iN0 = 0;
do
{
// load b1
const auto b1_block_tile = load_tile(b1_dram_block_window);
// Block GEMM0 pipeline: acc0 = a0 * b0
const auto acc0_block_tile = block_gemm0_pipeline(
a0_dram_block_window, b0_dram_block_window, K0 / kK0PerBlock, p_smem_char);
// type cast acc0 into c0
const auto c0_block_tile =
tile_elementwise_in(type_convert<C0DataType, Acc0DataType>, acc0_block_tile);
// Block GEMM1: acc1 += c0 * b1
{
// wait for block gemm0 pipeline to finish
ps.block_sync_lds();
store_tile(b1_lds_block_window, b1_block_tile);
// wait for store_tile to finish
ps.block_sync_lds();
// acc1 += c0 * b1
block_gemm1(acc1_block_tile, c0_block_tile, b1_lds_block_window);
// wait for block gemm1 to finish
ps.block_sync_lds();
}
// move tile windows
move_tile_window(b0_dram_block_window, {kN0PerBlock, 0});
move_tile_window(b1_dram_block_window, {0, kN0PerBlock});
iN0 += kN0PerBlock;
} while(iN0 < N0);
#endif
// type cast acc1 into c1
const auto c1_block_tile =
tile_elementwise_in(type_convert<C1DataType, Acc1DataType>, acc1_block_tile);
// store c1
auto c1_dram_grid = make_naive_tensor_view<AddressSpaceEnum::Global>(
p_c1, make_tuple(M0, N1), make_tuple(Ldc1, 1), Number<32>{}, Number<1>{});
auto c1_dram_window =
make_tile_window(c1_dram_grid,
make_tuple(Number<kM0PerBlock>{}, Number<kN1PerBlock>{}),
{iM0, iN1},
c1_block_tile.GetTileDistribution());
store_tile(c1_dram_window, c1_block_tile);
}
};
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
#include "tile_program.hpp"
#include "ck/library/utility/device_memory.hpp"
// ProgramServer contains a "meta data buffer":
// the host evaluates the expression inside ps() and pushes the result into the meta data buffer,
// ProgramServer sends the meta data buffer to the GPU as a kernel argument,
// and the device reads (does not re-evaluate) the value of the expression inside ps() from the meta data buffer.
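// A minimal sketch of the intended flow (inferred from the description above and the HelloWorld program below):
//   on the host:   auto r = ps(x + y);  // x + y is evaluated here; the result is pushed into the meta data buffer
//   on the device: auto r = ps(x + y);  // the expression is not re-evaluated; the stored result is pulled from the buffer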
struct HelloWorld
{
__host__ __device__ void operator()(ProgramServer& ps, int x, int y, int* res)
{
#if 1
auto r0 = ps(x + y);
auto r1 = ps(x - y);
res[0] = r0;
res[1] = r1;
#elif 1
(void)x;
(void)y;
auto r0 = ps.get_thread_id();
auto r1 = ps.warp_shuffle_up(r0, 1);
auto r2 = ps.warp_shuffle_down(r0, 1);
printf("tid %d, r0 %d, r1 %d, r2 %d\n", ps.get_thread_id(), r0, r1, r2);
res[0] = r0;
res[1] = r2;
#endif
}
};
int main()
{
int x = 100;
int y = 101;
DeviceMem res_dev_buf(2 * sizeof(int));
launch(ProgramServer{},
HelloWorld{},
1,
64,
x,
y,
static_cast<int*>(res_dev_buf.GetDeviceBuffer()));
int res_host[2];
res_dev_buf.FromDevice(&res_host);
printf("x+y=: %d\n", res_host[0]);
printf("x-y=: %d\n", res_host[1]);
return 0;
}
#include <string_view>
#include <tuple>
#include <array>
#include <utility>
#include <type_traits>
#include <cstring>
#include <functional>
#include <iostream>
#include <iterator>
#include <numeric>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_window.hpp"
#include "ck/tile_program/tile/load_tile.hpp"
#include "ck/tile_program/tile/store_tile.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
template <typename T>
void reference_im2col(Tensor<T>& in_mtx_host_ref,
const Tensor<T>& in_host,
int /*N*/,
int /*K*/,
int C,
int /*Y*/,
int X,
int Hi,
int Wi,
int Ho,
int Wo,
int ConvStrideH,
int ConvStrideW,
int ConvDilationH,
int ConvDilationW,
int InLeftPadH,
int InLeftPadW,
int /*InRightPadH*/,
int /*InRightPadW*/)
{
int GemmM = in_mtx_host_ref.GetLengths()[0];
int GemmK = in_mtx_host_ref.GetLengths()[1];
for(int gemm_m = 0; gemm_m < GemmM; ++gemm_m)
{
int mtmp = gemm_m;
int n = mtmp / (Ho * Wo);
mtmp -= n * Ho * Wo;
int ho = mtmp / Wo;
int wo = mtmp - ho * Wo;
for(int gemm_k = 0; gemm_k < GemmK; ++gemm_k)
{
int ktmp = gemm_k;
int y = ktmp / (X * C);
ktmp -= y * X * C;
int x = ktmp / C;
int c = ktmp - x * C;
int hi = y * ConvDilationH + ho * ConvStrideH - InLeftPadH;
int wi = x * ConvDilationW + wo * ConvStrideW - InLeftPadW;
bool inbound = (hi >= 0 && hi < Hi && wi >= 0 && wi < Wi);
in_mtx_host_ref(gemm_m, gemm_k) = inbound ? in_host(n, hi, wi, c) : 0;
}
}
}
template <ck::index_t NDimSpatial,
typename T,
ck::index_t kBlockSize,
ck::index_t kMPerBlock,
ck::index_t kKPerBlock>
struct Im2Col
{
__host__ __device__ static constexpr auto MakeBlockCopyTileDistribution()
{
using namespace ck;
using namespace ck::tile_program;
constexpr index_t NumWarp = kBlockSize / get_warp_size();
constexpr index_t K1 = 16 / sizeof(T);
constexpr index_t K0 = kKPerBlock / K1;
constexpr index_t M2 = get_warp_size() / K0;
constexpr index_t M1 = NumWarp;
constexpr index_t M0 = kMPerBlock / (M1 * M2);
return make_static_tile_distribution(
StaticTileDistributionEncoding<Sequence<>,
Tuple<Sequence<M0, M1, M2>, Sequence<K0, K1>>,
Tuple<Sequence<1>, Sequence<1, 2>>,
Tuple<Sequence<1>, Sequence<2, 0>>,
Sequence<1, 2>,
Sequence<0, 1>>{});
}
template <typename Server>
__host__ __device__ void
operator()(Server& ps,
const std::array<ck::index_t, NDimSpatial + 2>& a_n_wis_c_lengths,
const std::array<ck::index_t, NDimSpatial + 2>& /* a_n_wis_c_strides */,
const std::array<ck::index_t, NDimSpatial + 2>& b_k_xs_c_lengths,
const std::array<ck::index_t, NDimSpatial + 2>& /* b_k_xs_c_strides */,
const std::array<ck::index_t, NDimSpatial + 2>& c_n_wos_k_lengths,
const std::array<ck::index_t, NDimSpatial + 2>& /* c_n_wos_k_strides */,
const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
const std::array<ck::index_t, NDimSpatial>& input_left_pads,
const std::array<ck::index_t, NDimSpatial>& input_right_pads,
//
const std::array<ck::index_t, 2> a_gemmm_gemmk_lengths,
const std::array<ck::index_t, 2> a_gemmm_gemmk_strides,
//
const T* p_a_img,
T* p_a_mtx)
{
using namespace ck;
using namespace ck::tile_program;
const index_t N = a_n_wis_c_lengths[0];
const index_t C = a_n_wis_c_lengths[3];
const index_t Hi = a_n_wis_c_lengths[1];
const index_t Wi = a_n_wis_c_lengths[2];
const index_t Ho = c_n_wos_k_lengths[1];
const index_t Wo = c_n_wos_k_lengths[2];
const index_t Y = b_k_xs_c_lengths[1];
const index_t X = b_k_xs_c_lengths[2];
const index_t ConvStrideH = conv_filter_strides[0];
const index_t ConvStrideW = conv_filter_strides[1];
const index_t ConvDilationH = conv_filter_dilations[0];
const index_t ConvDilationW = conv_filter_dilations[1];
const index_t InLeftPadH = input_left_pads[0];
const index_t InLeftPadW = input_left_pads[1];
const index_t InRightPadH = input_right_pads[0];
const index_t InRightPadW = input_right_pads[1];
const auto a_n_hi_wi_c = make_naive_tensor_view_packed<AddressSpaceEnum::Global>(
p_a_img, make_tuple(N, Hi, Wi, C), Number<32>{});
const auto a_n_hip_wip_c = transform_tensor_view(
a_n_hi_wi_c,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto a_n_y_ho_x_wo_c = transform_tensor_view(
a_n_hip_wip_c,
make_tuple(
make_pass_through_transform(N),
make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
const auto src_gemmm_gemmk =
transform_tensor_view(a_n_y_ho_x_wo_c,
make_tuple(ps(make_merge_transform(make_tuple(N, Ho, Wo))),
ps(make_merge_transform(make_tuple(Y, X, C)))),
make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto dst_gemmm_gemmk = make_naive_tensor_view<AddressSpaceEnum::Global>(
p_a_mtx,
make_tuple(a_gemmm_gemmk_lengths[0], a_gemmm_gemmk_lengths[1]),
make_tuple(a_gemmm_gemmk_strides[0], a_gemmm_gemmk_strides[1]),
Number<32>{},
Number<1>{});
const auto numGemmM = a_gemmm_gemmk_lengths[0];
const auto numGemmK = a_gemmm_gemmk_lengths[1];
const auto id_block = ps.get_block_id();
const auto num_tile_m = ps.read_first_lane(numGemmM / kMPerBlock);
const auto block2tile = ps(make_cluster_descriptor(make_tuple(num_tile_m)));
const auto i_gemmm_gemmk = block2tile.CalculateBottomIndex(make_multi_index(id_block));
const auto iGemmM = ps.read_first_lane(i_gemmm_gemmk[0]) * kMPerBlock;
// src window
auto src_block_window =
make_tile_window(src_gemmm_gemmk,
make_tuple(Number<kMPerBlock>{}, Number<kKPerBlock>{}),
{iGemmM, 0},
MakeBlockCopyTileDistribution());
// dst window
auto dst_block_window = make_tile_window(
dst_gemmm_gemmk, make_tuple(Number<kMPerBlock>{}, Number<kKPerBlock>{}), {iGemmM, 0});
index_t iGemmK = 0;
do
{
const auto block_tile = load_tile(src_block_window);
store_tile(dst_block_window, block_tile);
move_tile_window(src_block_window, {0, kKPerBlock});
move_tile_window(dst_block_window, {0, kKPerBlock});
iGemmK += kKPerBlock;
} while(iGemmK < numGemmK);
}
};
int main()
{
using DataType = ck::half_t;
constexpr ck::index_t NumDimSpatial = 2;
ck::index_t N = 128;
ck::index_t K = 1;
ck::index_t C = 256;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 28;
ck::index_t Wi = 28;
ck::index_t Ho = 14;
ck::index_t Wo = 14;
std::array<ck::index_t, NumDimSpatial + 2> in_lengths{N, Hi, Wi, C};
std::array<ck::index_t, NumDimSpatial + 2> in_strides{0, 0, 0, 1};
std::array<ck::index_t, NumDimSpatial + 2> wei_lengths{K, Y, X, C};
std::array<ck::index_t, NumDimSpatial + 2> wei_strides{0, 0, 0, 1};
std::array<ck::index_t, NumDimSpatial + 2> out_lengths{N, Ho, Wo, K};
std::array<ck::index_t, NumDimSpatial + 2> out_strides{0, 0, 0, 1};
std::partial_sum(rbegin(in_lengths),
std::prev(rend(in_lengths)),
std::next(rbegin(in_strides)),
std::multiplies<>{});
std::partial_sum(rbegin(wei_lengths),
std::prev(rend(wei_lengths)),
std::next(rbegin(wei_strides)),
std::multiplies<>{});
std::partial_sum(rbegin(out_lengths),
std::prev(rend(out_lengths)),
std::next(rbegin(out_strides)),
std::multiplies<>{});
std::array<ck::index_t, NumDimSpatial> filter_strides{2, 2};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
// matrix
std::array<ck::index_t, 2> in_mtx_lengths{N * Ho * Wo, Y * X * C};
std::array<ck::index_t, 2> in_mtx_strides{0, 1};
std::partial_sum(rbegin(in_mtx_lengths),
std::prev(rend(in_mtx_lengths)),
std::next(rbegin(in_mtx_strides)),
std::multiplies<>{});
// host verify
Tensor<DataType> in_host(in_lengths, in_strides);
Tensor<DataType> in_mtx_host_ref(in_mtx_lengths, in_mtx_strides);
Tensor<DataType> in_mtx_host_dev(in_mtx_lengths, in_mtx_strides);
std::cout << " image tensor element size: " << in_host.GetElementSize() << std::endl;
std::cout << "matrix tensor element size: " << in_mtx_host_ref.GetElementSize() << std::endl;
std::cout << " image tensor element space size: " << in_host.GetElementSpaceSize() << std::endl;
std::cout << "matrix tensor element sapce size: " << in_mtx_host_ref.GetElementSpaceSize()
<< std::endl;
ck::utils::FillUniformDistributionIntegerValue<DataType>{-5.f, 5.f}(in_host);
reference_im2col(in_mtx_host_ref,
in_host,
N,
K,
C,
Y,
X,
Hi,
Wi,
Ho,
Wo,
filter_strides[0],
filter_strides[1],
filter_dilations[0],
filter_dilations[1],
input_left_pads[0],
input_left_pads[1],
input_right_pads[0],
input_right_pads[1]);
DeviceMem in_buf(sizeof(DataType) * in_host.GetElementSpaceSize());
DeviceMem in_mtx_buf(sizeof(DataType) * in_mtx_host_ref.GetElementSpaceSize());
in_buf.ToDevice(in_host.mData.data());
constexpr ck::index_t kBlockSize = 256;
constexpr ck::index_t kGemmMPerBlock = 256;
constexpr ck::index_t kGemmKPerBlock = 128;
ck::index_t kGridSize = (N * Ho * Wo) / kGemmMPerBlock;
float ave_time = launch(ProgramServer{},
Im2Col<2, DataType, kBlockSize, kGemmMPerBlock, kGemmKPerBlock>{},
kGridSize,
kBlockSize,
in_lengths,
in_strides,
wei_lengths,
wei_strides,
out_lengths,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
//
in_mtx_lengths,
in_mtx_strides,
//
static_cast<DataType*>(in_buf.GetDeviceBuffer()),
static_cast<DataType*>(in_mtx_buf.GetDeviceBuffer()));
std::size_t num_btype = sizeof(DataType) * in_host.GetElementSize() +
sizeof(DataType) * in_mtx_host_ref.GetElementSize();
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
in_mtx_buf.FromDevice(in_mtx_host_dev.mData.data());
return !ck::utils::check_err(in_mtx_host_dev, in_mtx_host_ref);
}
#include <cstring>
#include <iostream>
#include <string>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor_view.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "reduce.hpp"
template <typename ADataType, typename AccDataType, typename BDataType>
void reference_reduce(const Tensor<ADataType>& a_m_n, Tensor<BDataType>& b_m)
{
auto f = [&](auto m) {
const int N = a_m_n.mDesc.GetLengths()[1];
AccDataType v_acc = 0;
for(int n = 0; n < N; ++n)
{
const ADataType v_a = a_m_n(m, n);
v_acc += v_a;
}
b_m(m) = ck::type_convert<BDataType>(v_acc);
};
make_ParallelTensorFunctor(f, b_m.mDesc.GetLengths()[0])(std::thread::hardware_concurrency());
}
int main(int argc, char* argv[])
{
using ADataType = float;
using AccDataType = float;
using BDataType = float;
ck::index_t M = 3328;
ck::index_t N = 4096;
if(argc == 3)
{
M = std::stoi(argv[1]);
N = std::stoi(argv[2]);
}
std::array<ck::index_t, 2> a_lengths{M, N};
std::array<ck::index_t, 2> a_strides{N, 1};
std::array<ck::index_t, 1> b_lengths{M};
std::array<ck::index_t, 1> b_strides{1};
// host verify
Tensor<ADataType> a_host(a_lengths, a_strides);
Tensor<BDataType> b_host_ref(b_lengths, b_strides);
Tensor<BDataType> b_host_dev(b_lengths, b_strides);
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_host);
// reference
reference_reduce<ADataType, AccDataType, BDataType>(a_host, b_host_ref);
DeviceMem a_buf(sizeof(ADataType) * a_host.GetElementSpaceSize());
DeviceMem b_buf(sizeof(BDataType) * b_host_ref.GetElementSpaceSize());
a_buf.ToDevice(a_host.mData.data());
constexpr ck::index_t kMPerBlock = 128;
constexpr ck::index_t kNPerBlock = 128;
constexpr ck::index_t kBlockSize = 256;
ck::index_t kGridSize = (M / kMPerBlock);
std::cout << "grid size " << kGridSize << std::endl;
const auto kernel =
Reduce<ADataType, AccDataType, BDataType, kBlockSize, kMPerBlock, kNPerBlock>{};
float ave_time = launch(ProgramServer{},
kernel,
kGridSize,
kBlockSize,
static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
M,
N);
b_buf.FromDevice(b_host_dev.mData.data());
std::size_t num_btype = sizeof(ADataType) * M * N + sizeof(BDataType) * M;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
return !ck::utils::check_err(b_host_dev, b_host_ref);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "tile_program.hpp"
#include "ck/tile_program/tile/tile_distribution.hpp"
#include "ck/tile_program/tile/tile_window.hpp"
#include "ck/tile_program/tile/load_tile.hpp"
#include "ck/tile_program/tile/store_tile.hpp"
#include "ck/tile_program/tile/tile_elementwise.hpp"
#include "ck/tile_program/block_tile/block_reduce.hpp"
template <typename ADataType,
typename AccDataType,
typename BDataType,
ck::index_t kBlockSize,
ck::index_t kMPerBlock,
ck::index_t kNPerBlock>
struct Reduce
{
#if 0
__host__ __device__ static constexpr auto MakeABlockTileDistribution()
{
using namespace ck;
using namespace ck::tile_program;
// 2x2 wave
return make_static_tile_distribution(
StaticTileDistributionEncoding<Sequence<>,
Tuple<Sequence<2, 2, 4, 2, 4>, Sequence<2, 2, 32>>,
Tuple<Sequence<1, 2>, Sequence<1, 2>>,
Tuple<Sequence<1, 1>, Sequence<3, 2>>,
Sequence<1, 2, 1, 1>,
Sequence<0, 0, 2, 4>>{});
}
#elif 0
__host__ __device__ static constexpr auto MakeABlockTileDistribution()
{
using namespace ck;
using namespace ck::tile_program;
// 2x2 wave
return make_static_tile_distribution(
StaticTileDistributionEncoding<Sequence<>,
Tuple<Sequence<2, 2, 32>, Sequence<2, 2, 4, 2, 4>>,
Tuple<Sequence<2, 1>, Sequence<2, 1>>,
Tuple<Sequence<1, 1>, Sequence<3, 2>>,
Sequence<2, 1, 2, 2>,
Sequence<0, 0, 2, 4>>{});
}
#elif 1
__host__ __device__ static constexpr auto MakeABlockTileDistribution()
{
using namespace ck;
using namespace ck::tile_program;
// 4x1 wave
return make_static_tile_distribution(
StaticTileDistributionEncoding<Sequence<>,
Tuple<Sequence<1, 4, 4, 2, 4>, Sequence<4, 1, 32>>,
Tuple<Sequence<1, 2>, Sequence<1, 2>>,
Tuple<Sequence<1, 1>, Sequence<3, 2>>,
Sequence<1, 2, 1, 1>,
Sequence<0, 0, 2, 4>>{});
}
#endif
__host__ __device__ void operator()(
ProgramServer& ps, const ADataType* p_a, BDataType* p_b, ck::index_t M, ck::index_t N) const
{
using namespace ck;
using namespace ck::tile_program;
using namespace ck::tile_program::block;
const auto a_m_n = make_naive_tensor_view<AddressSpaceEnum::Global>(
p_a, make_tuple(M, N), make_tuple(N, 1), Number<32>{}, Number<1>{});
const auto iM = ps.get_block_id() * kMPerBlock;
// A window
auto a_block_window =
make_tile_window(a_m_n,
make_tuple(Number<kMPerBlock>{}, Number<kNPerBlock>{}),
{iM, 0},
MakeABlockTileDistribution());
const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; };
const ADataType reduce_init_value = 0;
constexpr auto reduce_dims = Sequence<1>{};
// Acc tile
// FIXME: support cross warp reduction
auto acc_block_tensor = decltype(block_tile_reduce<AccDataType>(
load_tile(a_block_window), reduce_dims, f_reduce, reduce_init_value)){};
// init Acc tile
tile_elementwise_inout(
[&](auto& acc) { acc = type_convert<AccDataType>(reduce_init_value); },
acc_block_tensor);
// loop
index_t iN = 0;
do
{
const auto a_block_tensor = load_tile(a_block_window);
// FIXME: support cross warp reduction
block_tile_reduce(acc_block_tensor, a_block_tensor, reduce_dims, f_reduce);
move_tile_window(a_block_window, {0, kNPerBlock});
iN += kNPerBlock;
} while(iN < N);
// FIXME: support cross warp reduction
block_tile_reduce_sync(acc_block_tensor, f_reduce);
// convert acc_block_tensor to b_block_tensor
const auto b_block_tensor = tile_elementwise_in(
[](const auto& acc) { return type_convert<BDataType>(acc); }, acc_block_tensor);
// B
const auto b_m = make_naive_tensor_view_packed<AddressSpaceEnum::Global>(
p_b, make_tuple(M), Number<32>{});
// B window
auto b_block_window = make_tile_window(b_m, make_tuple(Number<kMPerBlock>{}), {iM});
// store B tile
store_tile(b_block_window, b_block_tensor);
}
};
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/library/utility/host_tensor.hpp"
template <typename ADataType, typename BDataType, typename CDataType, typename AccDataType>
void reference_gemm(const Tensor<ADataType>& a_m_k,
const Tensor<BDataType>& b_n_k,
Tensor<CDataType>& c_m_n)
{
auto f_mk_kn_mn = [&](auto m, auto n) {
const int K = a_m_k.mDesc.GetLengths()[1];
AccDataType v_acc = 0;
for(int k = 0; k < K; ++k)
{
ADataType v_a = a_m_k(m, k);
BDataType v_b = b_n_k(n, k);
v_acc += ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
}
c_m_n(m, n) = ck::type_convert<CDataType>(v_acc);
};
make_ParallelTensorFunctor(f_mk_kn_mn,
c_m_n.mDesc.GetLengths()[0],
c_m_n.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <hip/hip_runtime.h>
#include "ck/ck.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
// Meta data for GPU
// TODO: do we need to take care of data alignment in code, or is it handled by the compiler?
template <ck::index_t kSize>
struct MetaData
{
char p_data_[kSize];
ck::index_t size_ = 0;
ck::index_t pos_ = 0;
__host__ __device__ void reset()
{
size_ = 0;
pos_ = 0;
}
__device__ void reset_pos() { pos_ = 0; }
// push meta data on host
// TODO: correct forwarding?
template <typename T>
__host__ auto push(T&& a)
{
using Type = ck::remove_cvref_t<T>;
static_assert(std::is_trivially_copy_constructible_v<Type> &&
std::is_trivially_destructible_v<Type>);
assert(size_ + sizeof(Type) <= kSize);
// use placement new to create object copy
new(p_data_ + size_) Type(std::forward<T>(a));
size_ += sizeof(Type);
return ck::forwarder{}(a);
}
// pull meta data on device
// TODO: correct forwarding?
template <typename T>
__device__ auto pull()
{
using Type = ck::remove_cvref_t<T>;
static_assert(std::is_trivially_copy_constructible_v<Type> &&
std::is_trivially_destructible_v<Type>);
Type a(*reinterpret_cast<Type*>(p_data_ + pos_));
pos_ += sizeof(Type);
return a;
}
};
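// Illustrative use of MetaData (a sketch mirroring ProgramServer below); pushed values must be
// trivially copyable and are pulled back on the device in the same order they were pushed:
//   host:   meta_data.push(some_value);                      // placement-new a copy into p_data_, advance size_
//   device: auto v = meta_data.pull<decltype(some_value)>(); // read the stored copy back, advance pos_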
// namespace tp (for tile programming)
struct ProgramServer
{
// meta data on device
MetaData<1024> meta_data_;
__host__ void cpu_init() { meta_data_.reset(); }
__device__ void gpu_init() { meta_data_.reset_pos(); }
// push meta data on host
template <typename T>
__host__ auto operator()(T&& a)
{
return ck::forwarder{}(meta_data_.push(a));
}
// push meta data on host
template <typename T>
__device__ auto operator()(T&&)
{
return ck::forwarder{}(meta_data_.pull<T>());
}
//
__host__ static ck::index_t get_block_id() { return -1; }
__host__ static ck::index_t get_thread_id() { return -1; }
__host__ static ck::index_t get_grid_size() { return -1; }
__host__ static void block_sync_lds() {}
// TODO: correct forwarding?
template <typename T>
__host__ static constexpr auto read_first_lane(T&& a)
{
return ck::forwarder{}(a);
}
template <typename T>
__host__ T warp_shuffle_up(T, uint32_t)
{
return 0;
}
template <typename T>
__host__ T warp_shuffle_down(T, uint32_t)
{
return 0;
}
//
__device__ static ck::index_t get_block_id() { return ck::get_block_id(); }
__device__ static ck::index_t get_thread_id() { return ck::get_thread_id(); }
__device__ static ck::index_t get_grid_size() { return ck::get_grid_size(); }
__device__ static void block_sync_lds() { ck::block_sync_lds(); }
template <typename T>
__device__ static constexpr auto read_first_lane(T&& a)
{
return __builtin_amdgcn_readfirstlane(a);
}
template <typename T>
__device__ T warp_shuffle_up(const T& var, uint32_t delta)
{
return ck::warp_shuffle_up(var, delta);
}
template <typename T>
__device__ T warp_shuffle_down(const T& var, uint32_t delta)
{
return ck::warp_shuffle_down(var, delta);
}
};
template <typename Server, typename Program, typename... Xs>
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
__global__ void gpu_program_wrapper(Server server, Program f, Xs... xs)
{
server.gpu_init();
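// gpu_init() rewinds the meta data read position so the device-side run of f
// pulls values in the same order the host-side run pushed them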
f(server, xs...);
}
template <typename Server, typename Program, typename... Xs>
float launch(Server server, Program f, dim3 grid_dim, dim3 block_dim, Xs... xs)
{
server.cpu_init();
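// dry-run the program on the host first: each ps(...) expression is evaluated here and its
// result is recorded into the meta data buffer that is passed to the kernel launch below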
f(server, xs...);
printf("meta data size %d\n", server.meta_data_.size_);
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
__func__,
grid_dim.x,
grid_dim.y,
grid_dim.z,
block_dim.x,
block_dim.y,
block_dim.z);
#if 0
gpu_program_wrapper<Server, Program><<<grid_dim, block_dim, 0, nullptr>>>(server, f, xs...);
#else
return launch_and_time_kernel(StreamConfig{nullptr, true, 0},
gpu_program_wrapper<Server, Program, Xs...>,
grid_dim,
block_dim,
0,
server,
f,
xs...);
#endif
}
@@ -134,7 +134,7 @@
 #define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
 // experimental feature: multi index implemented as array
-#define CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
+#define CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 1
 // experimental feature: static tensor descriptor
 #define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0
@@ -150,9 +150,6 @@
 #define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
 #define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK 1
-// experimental feature: in-regsiter sub-dword transpose
-#define CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE 1
 // experimental feature: merge transformation use magic number division
 #define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1
@@ -160,9 +157,6 @@
 // pointer of scalar
 #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
-// experimental feature: use __builtin_memcpy instead of union to do bit_cast
-#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
 // experimental feature: optimize for inter-wave scheduling policy
 #define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 1
 #define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
@@ -215,6 +209,11 @@
 namespace ck {
+enum struct ArchEnum
+{
+    Gfx90a,
+};
+
 enum struct InMemoryDataOperationEnum
 {
     Set,
......
@@ -78,7 +78,9 @@ struct StaticTensor
     StaticBuffer<AddressSpace, T, element_space_size_, true> data_;
     static constexpr T zero_scalar_value_ = T{0};
+    // for read access of invalid element
     const T invalid_element_scalar_value_;
+    // for write access of invalid element
     T ignored_element_scalar_;
 };
@@ -101,12 +103,12 @@ struct StaticTensorTupleOfVectorBuffer
     using V = vector_type<S, ScalarPerVector>;
     __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer()
-        : invalid_element_scalar_value_{0}
+        : invalid_element_scalar_value_{0}, ignored_element_scalar_{0}
     {
     }
     __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer(S invalid_element_value)
-        : invalid_element_scalar_value_{invalid_element_value}
+        : invalid_element_scalar_value_{invalid_element_value}, ignored_element_scalar_{0}
     {
     }
@@ -244,7 +246,9 @@ struct StaticTensorTupleOfVectorBuffer
     StaticBufferTupleOfVector<AddressSpace, S, num_of_vector_, ScalarPerVector, true> data_;
     static constexpr S zero_scalar_value_ = S{0};
+    // for read access of invalid element
     const S invalid_element_scalar_value_ = S{0};
+    // for write access of invalid element
     S ignored_element_scalar_;
 };
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_description/tensor_coordinate.hpp"
namespace ck {
template <typename BufferView_, typename TensorDesc_>
struct TensorView
{
using BufferView = remove_reference_t<BufferView_>;
using DataType = typename BufferView::type;
using TensorDesc = remove_cvref_t<TensorDesc_>;
using TensorIndex = Array<index_t, TensorDesc::GetNumOfTopDimension()>;
using TensorCoord = decltype(make_tensor_coordinate(TensorDesc{}, TensorIndex{}));
__host__ __device__ constexpr TensorView() = default;
__host__ __device__ constexpr TensorView(const BufferView& buffer_view, const TensorDesc& desc)
: buf_{buffer_view}, desc_{desc}
{
}
__host__ __device__ constexpr auto& GetTensorDescriptor() const { return desc_; }
__host__ __device__ static constexpr index_t GetNumOfDimension()
{
return TensorDesc::GetNumOfTopDimension();
}
__host__ __device__ constexpr const auto& GetBufferView() const { return buf_; }
__host__ __device__ constexpr auto& GetBufferView() { return buf_; }
__host__ __device__ constexpr DataType GetElement(const TensorCoord& coord) const
{
return buf_.template Get<DataType>(
coord.GetOffset(),
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord));
}
__host__ __device__ constexpr void SetElement(const TensorCoord& coord, const DataType& x)
{
buf_.template Set<DataType>(
coord.GetOffset(),
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
x);
}
// X is vector of DataType.
// "coord" is coordinate of DataType, not X. "coord" should be aligned to X
template <typename X,
typename enable_if<is_same_v<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<DataType>>::type>,
bool>::type = false>
__host__ __device__ constexpr remove_cvref_t<X>
GetVectorizedElements(const TensorCoord& coord) const
{
return buf_.template Get<X>(
coord.GetOffset(),
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord));
}
// X is vector of DataType.
// "coord" is coordinate of DataType, not X. "coord" should be aligned to X
template <typename X,
typename enable_if<is_same_v<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<DataType>>::type>,
bool>::type = false>
__host__ __device__ constexpr void SetVectorizedElements(const TensorCoord& coord, const X& x)
{
buf_.template Set<X>(coord.GetOffset(),
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
x);
}
__host__ __device__ void Print() const
{
printf("TensorView{");
// buf_
printf("buf_: ");
print(buf_);
printf(", ");
// desc_
printf("desc_: ");
print(desc_);
printf("}");
}
// member
BufferView buf_;
TensorDesc desc_;
};
template <AddressSpaceEnum BufferAddressSpace = AddressSpaceEnum::Generic,
typename DataType,
typename... Ts>
__host__ __device__ constexpr auto make_tensor_view(DataType* p,
const TensorDescriptor<Ts...>& desc)
{
auto buffer_view = make_buffer_view<BufferAddressSpace>(p, desc.GetElementSpaceSize());
return TensorView<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
}
template <AddressSpaceEnum BufferAddressSpace = AddressSpaceEnum::Generic,
typename DataType,
typename... Lengths,
typename... Strides,
index_t GuaranteedLastDimensionVectorLength = -1,
index_t GuaranteedLastDimensionVectorStride = -1,
typename enable_if<sizeof...(Lengths) == sizeof...(Strides), bool>::type = false>
__host__ __device__ constexpr auto
make_naive_tensor_view(DataType* p,
const Tuple<Lengths...>& lengths,
const Tuple<Strides...>& strides,
Number<GuaranteedLastDimensionVectorLength> = Number<-1>{},
Number<GuaranteedLastDimensionVectorStride> = Number<-1>{})
{
auto desc = make_naive_tensor_descriptor(lengths,
strides,
Number<GuaranteedLastDimensionVectorLength>{},
Number<GuaranteedLastDimensionVectorStride>{});
auto buffer_view = make_buffer_view<BufferAddressSpace>(p, desc.GetElementSpaceSize());
return TensorView<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
}
template <AddressSpaceEnum BufferAddressSpace = AddressSpaceEnum::Generic,
typename DataType,
typename... Lengths,
index_t GuaranteedLastDimensionVectorLength = -1>
__host__ __device__ constexpr auto
make_naive_tensor_view_packed(DataType* p,
const Tuple<Lengths...>& lengths,
Number<GuaranteedLastDimensionVectorLength> = Number<-1>{})
{
auto desc =
make_naive_tensor_descriptor_packed(lengths, Number<GuaranteedLastDimensionVectorLength>{});
auto buffer_view = make_buffer_view<BufferAddressSpace>(p, desc.GetElementSpaceSize());
return TensorView<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
}
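// Apply additional coordinate transforms to an existing TensorView's descriptor;
// the underlying buffer view is reused unchanged.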
template <typename OldTensorView,
typename NewTransforms,
typename NewLowerDimensionOldVisibleIdss,
typename NewUpperDimensionNewVisibleIdss>
__host__ __device__ constexpr auto transform_tensor_view(const OldTensorView& old_tensor_view,
const NewTransforms& new_transforms,
NewLowerDimensionOldVisibleIdss,
NewUpperDimensionNewVisibleIdss)
{
auto new_desc = transform_tensor_descriptor(old_tensor_view.desc_,
new_transforms,
NewLowerDimensionOldVisibleIdss{},
NewUpperDimensionNewVisibleIdss{});
return TensorView<typename OldTensorView::BufferView, remove_cvref_t<decltype(new_desc)>>{
old_tensor_view.buf_, new_desc};
}
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/buffer_view.hpp"
namespace ck {
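// Tensor held in thread-private registers (a Vgpr BufferView over p_data_);
// capacity is fixed at kMaxBufferSize_ elements of DataType.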
template <typename DataType, typename TensorDescriptor>
struct ThreadPrivateTensor
{
using DataType_ = DataType;
using T = DataType_;
using TensorDescriptor_ = remove_cvref_t<TensorDescriptor>;
__host__ __device__ constexpr ThreadPrivateTensor()
// FIXME: remove static_cast
: desc_{}, buf_view_{p_data_, static_cast<index_t>(desc_.GetElementSpaceSize())}
{
}
__host__ __device__ constexpr ThreadPrivateTensor(const TensorDescriptor_& desc)
// FIXME: remove static_cast
: desc_{desc}, buf_view_{p_data_, static_cast<index_t>(desc_.GetElementSpaceSize())}
{
}
// member
TensorDescriptor_ desc_;
//
static constexpr index_t kMaxBufferSize_ = 32;
DataType_ p_data_[kMaxBufferSize_];
// FIXME: remove assumption that type of BufferSize should be index_t
BufferView<AddressSpaceEnum::Vgpr, DataType_, index_t, true> buf_view_;
// function
// FIXME: doesn't do is_valid check
template <typename Idx>
__device__ constexpr const T& operator[](const Idx& idx) const
{
#if 1 // debug
// FIXME: remove to_multi_index
const auto coord = make_tensor_coordinate(desc_, to_multi_index(idx));
const index_t offset = coord.GetOffset();
return buf_view_[offset];
#else
constexpr auto coord = make_tensor_coordinate(TensorDescriptor_{}, to_multi_index(Idx{}));
constexpr index_t offset = coord.GetOffset();
return buf_view_[offset];
#endif
}
// FIXME: doesn't do is_valid check
template <typename Idx>
__device__ constexpr T& operator()(const Idx& idx)
{
#if 1 // debug
// FIXME: remove to_multi_index
const auto coord = make_tensor_coordinate(desc_, to_multi_index(idx));
const index_t offset = coord.GetOffset();
return buf_view_(offset);
#else
constexpr auto coord = make_tensor_coordinate(TensorDescriptor_{}, to_multi_index(Idx{}));
constexpr index_t offset = coord.GetOffset();
return buf_view_(offset);
#endif
}
// idx is an index of T (DataType), not of X; idx should be aligned to the vector width of X.
template <typename X, typename Idx>
__device__ constexpr X Get(const Idx& idx) const
{
#if 1 // debug
const auto coord = make_tensor_coordinate(desc_, idx);
const index_t offset = coord.GetOffset();
const bool is_valid = coordinate_has_valid_offset(desc_, coord);
return buf_view_.template Get<X>(offset, is_valid);
#else
constexpr auto coord = make_tensor_coordinate(TensorDescriptor_{}, to_multi_index(Idx{}));
constexpr index_t offset = coord.GetOffset();
constexpr bool is_valid = coordinate_has_valid_offset(TensorDescriptor_{}, coord);
return buf_view_.template Get<X>(offset, is_valid);
#endif
}
// idx is an index of T (DataType), not of X; idx should be aligned to the vector width of X.
template <typename X, typename Idx>
__device__ void Set(const Idx& idx, const X& x)
{
#if 1 // debug
const auto coord = make_tensor_coordinate(desc_, idx);
const index_t offset = coord.GetOffset();
const bool is_valid = coordinate_has_valid_offset(desc_, coord);
buf_view_.template Set<X>(offset, is_valid, x);
#else
constexpr auto coord = make_tensor_coordinate(TensorDescriptor_{}, to_multi_index(Idx{}));
constexpr index_t offset = coord.GetOffset();
constexpr bool is_valid = coordinate_has_valid_offset(TensorDescriptor_{}, coord);
buf_view_.template Set<X>(offset, is_valid, x);
#endif
}
};
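// ---------------------------------------------------------------------------
// Illustrative, hypothetical usage sketch (not part of the library API): a
// per-thread 2x4 fp32 tile kept in registers (8 <= kMaxBufferSize_). The
// descriptor factory, make_tuple, and Number are assumed to be visible from
// other ck headers.
__device__ inline float thread_private_tensor_usage_sketch()
{
    // fully static packed descriptor, so the default constructor is usable
    using Desc = decltype(
        make_naive_tensor_descriptor_packed(make_tuple(Number<2>{}, Number<4>{}), Number<-1>{}));

    ThreadPrivateTensor<float, Desc> t;

    t(make_tuple(1, 3)) = 2.f;  // write through operator()
    return t[make_tuple(1, 3)]; // read back through operator[]
}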
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
// Macro function
// construct constexpr TensorAdaptor from constexpr encoding
// encoded_tensor_adaptor is a Tuple of the following objects:
// 1. encoded transforms (Array of fixed size). Each encoded transform is a Tuple of the following:
//    1.1 name (IndexTransformEnum)
//    1.2 meta data for the constructor of the transform
//    1.3 num of lower dimensions (index_t)
//    1.4 lower dimension Ids (Array of fixed size)
//    1.5 num of upper dimensions (index_t)
//    1.6 upper dimension Ids (Array of fixed size)
// 2. num of transforms (index_t)
// 3. encoded bottom dimension Ids (Array of fixed size)
// 4. num of bottom dimensions (index_t)
// 5. encoded top dimension Ids (Array of fixed size)
// 6. num of top dimensions (index_t)
// A hypothetical usage sketch follows the macro definition below.
#define CONSTRUCT_TENSOR_ADAPTOR_FROM_ENCODING(encoded_tensor_adaptor) \
[encoded_tensor_adaptor]() { \
using namespace ck; \
\
constexpr auto encoded_transforms = encoded_tensor_adaptor.template At<0>(); \
constexpr index_t num_transform = encoded_tensor_adaptor.template At<1>(); \
constexpr auto encoded_bottom_dims = encoded_tensor_adaptor.template At<2>(); \
constexpr index_t num_bottom_dim = encoded_tensor_adaptor.template At<3>(); \
constexpr auto encoded_top_dims = encoded_tensor_adaptor.template At<4>(); \
constexpr index_t num_top_dim = encoded_tensor_adaptor.template At<5>(); \
\
constexpr auto trans = [&encoded_transforms, &num_transform]() { \
return generate_tuple( \
[&encoded_transforms](auto i) constexpr { \
constexpr auto name = encoded_transforms[i].template At<0>(); \
constexpr auto meta_data = encoded_transforms[i].template At<1>(); \
constexpr auto num_low_dim = encoded_transforms[i].template At<2>(); \
constexpr auto num_up_dim = encoded_transforms[i].template At<4>(); \
\
                STATIC_ASSERT(name == IndexTransformEnum::PassThrough || \
                                  name == IndexTransformEnum::Pad || \
                                  name == IndexTransformEnum::Embed || \
                                  name == IndexTransformEnum::Merge || \
                                  name == IndexTransformEnum::UnMerge || \
                                  name == IndexTransformEnum::Replicate, \
                              ""); \
\
if constexpr(name == IndexTransformEnum::PassThrough) \
{ \
index_t pos = 0; \
auto low_len = meta_data.template Pop<index_t>(pos); \
\
return make_pass_through_transform(low_len); \
} \
else if constexpr(name == IndexTransformEnum::Pad) \
{ \
index_t pos = 0; \
auto low_len = meta_data.template Pop<index_t>(pos); \
auto left_pad = meta_data.template Pop<index_t>(pos); \
auto right_pad = meta_data.template Pop<index_t>(pos); \
\
return make_pad_transform(low_len, left_pad, right_pad); \
} \
else if constexpr(name == IndexTransformEnum::Embed) \
{ \
index_t pos = 0; \
auto up_lens = meta_data.template Pop<Array<index_t, num_up_dim>>(pos); \
auto coefficients = \
meta_data.template Pop<Array<index_t, num_up_dim>>(pos); \
\
return make_embed_transform(up_lens, coefficients); \
} \
else if constexpr(name == IndexTransformEnum::Merge) \
{ \
index_t pos = 0; \
auto low_lens = meta_data.template Pop<Array<index_t, num_low_dim>>(pos); \
\
return make_merge_transform(low_lens); \
} \
else if constexpr(name == IndexTransformEnum::UnMerge) \
{ \
index_t pos = 0; \
auto up_lens = meta_data.template Pop<Array<index_t, num_up_dim>>(pos); \
\
return make_unmerge_transform(up_lens); \
} \
else if constexpr(name == IndexTransformEnum::Replicate) \
{ \
index_t pos = 0; \
auto up_lens = meta_data.template Pop<Array<index_t, num_up_dim>>(pos); \
\
return make_replicate_transform(up_lens); \
} \
}, \
Number<num_transform>{}); \
}(); \
\
constexpr auto low_dim_idss = [&encoded_transforms, &num_transform]() { \
return generate_tuple( \
[&encoded_transforms](auto i) { \
constexpr auto num_low_dim = encoded_transforms[i].template At<2>(); \
constexpr auto low_dims = encoded_transforms[i].template At<3>(); \
\
return TO_SEQUENCE(low_dims, num_low_dim); \
}, \
Number<num_transform>()); \
}(); \
\
constexpr auto up_dim_idss = [&encoded_transforms, &num_transform] { \
return generate_tuple( \
[&encoded_transforms](auto i) { \
constexpr auto num_up_dim = encoded_transforms[i].template At<4>(); \
constexpr auto up_dims = encoded_transforms[i].template At<5>(); \
\
return TO_SEQUENCE(up_dims, num_up_dim); \
}, \
Number<num_transform>()); \
}(); \
\
constexpr auto bottom_dim_ids = TO_SEQUENCE(encoded_bottom_dims, num_bottom_dim); \
constexpr auto top_dim_ids = TO_SEQUENCE(encoded_top_dims, num_top_dim); \
\
return TensorAdaptor<remove_cvref_t<decltype(trans)>, \
remove_cvref_t<decltype(low_dim_idss)>, \
remove_cvref_t<decltype(up_dim_idss)>, \
remove_cvref_t<decltype(bottom_dim_ids)>, \
remove_cvref_t<decltype(top_dim_ids)>>{trans}; \
}()
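// ---------------------------------------------------------------------------
// Illustrative, hypothetical usage sketch of the macro above. The exact
// meta-data buffer type is not defined in this file, so "meta_data_of(...)"
// is only a placeholder; the Tuple layout follows the encoding described in
// the comment above the macro:
//
//   // a single PassThrough transform over a length-8 dimension,
//   // bottom dimension ids {0}, top dimension ids {0}
//   constexpr auto encoding = make_tuple(
//       Array<... /* encoded transform tuple */, 1>{
//           make_tuple(IndexTransformEnum::PassThrough,
//                      meta_data_of(index_t{8}),  // low_len
//                      1, Array<index_t, 1>{0},   // num / ids of lower dims
//                      1, Array<index_t, 1>{0})}, // num / ids of upper dims
//       1,                            // num of transforms
//       Array<index_t, 1>{0}, 1,      // bottom dimension ids / count
//       Array<index_t, 1>{0}, 1);     // top dimension ids / count
//
//   constexpr auto adaptor = CONSTRUCT_TENSOR_ADAPTOR_FROM_ENCODING(encoding);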
// Macro function
// construct static TensorAdaptor from constexpr encoding
// encoded_tensor_adaptor is a Tuple of the following objects:
// 1. encoded transforms (Array of fixed size). Each encoded transform is a Tuple of the following:
//    1.1 name (IndexTransformEnum)
//    1.2 meta data for the constructor of the transform
//    1.3 num of lower dimensions (index_t)
//    1.4 lower dimension Ids (Array of fixed size)
//    1.5 num of upper dimensions (index_t)
//    1.6 upper dimension Ids (Array of fixed size)
// 2. num of transforms (index_t)
// 3. encoded bottom dimension Ids (Array of fixed size)
// 4. num of bottom dimensions (index_t)
// 5. encoded top dimension Ids (Array of fixed size)
// 6. num of top dimensions (index_t)
// A short follow-up sketch, reusing the encoding from the sketch above, appears after this macro.
#define CONSTRUCT_STATIC_TENSOR_ADAPTOR_FROM_ENCODING(encoded_tensor_adaptor) \
[encoded_tensor_adaptor]() { \
using namespace ck; \
\
constexpr auto encoded_transforms = encoded_tensor_adaptor.template At<0>(); \
constexpr index_t num_transform = encoded_tensor_adaptor.template At<1>(); \
constexpr auto encoded_bottom_dims = encoded_tensor_adaptor.template At<2>(); \
constexpr index_t num_bottom_dim = encoded_tensor_adaptor.template At<3>(); \
constexpr auto encoded_top_dims = encoded_tensor_adaptor.template At<4>(); \
constexpr index_t num_top_dim = encoded_tensor_adaptor.template At<5>(); \
\
constexpr auto trans = [&encoded_transforms, &num_transform]() { \
return generate_tuple( \
[&encoded_transforms](auto i) constexpr { \
constexpr auto name = encoded_transforms[i].template At<0>(); \
constexpr auto meta_data = encoded_transforms[i].template At<1>(); \
constexpr auto num_low_dim = encoded_transforms[i].template At<2>(); \
constexpr auto num_up_dim = encoded_transforms[i].template At<4>(); \
\
                STATIC_ASSERT(name == IndexTransformEnum::PassThrough || \
                                  name == IndexTransformEnum::Pad || \
                                  name == IndexTransformEnum::Embed || \
                                  name == IndexTransformEnum::Merge || \
                                  name == IndexTransformEnum::UnMerge || \
                                  name == IndexTransformEnum::Replicate, \
                              ""); \
\
if constexpr(name == IndexTransformEnum::PassThrough) \
{ \
constexpr index_t low_len = meta_data.template Get<index_t>(0); \
\
return make_pass_through_transform(Number<low_len>{}); \
} \
else if constexpr(name == IndexTransformEnum::Pad) \
{ \
constexpr index_t low_len = meta_data.template Get<index_t>(0); \
\
constexpr index_t left_pad = \
meta_data.template Get<index_t>(sizeof(low_len)); \
\
constexpr index_t right_pad = \
                        meta_data.template Get<index_t>(sizeof(low_len) + sizeof(left_pad)); \
\
return make_pad_transform( \
Number<low_len>{}, Number<left_pad>{}, Number<right_pad>{}); \
} \
else if constexpr(name == IndexTransformEnum::Embed) \
{ \
constexpr auto up_lens = \
meta_data.template Get<Array<index_t, num_up_dim>>(0); \
\
constexpr auto coefficients = \
meta_data.template Get<Array<index_t, num_up_dim>>(sizeof(up_lens)); \
\
return make_embed_transform(TO_TUPLE_OF_NUMBER(up_lens, num_up_dim), \
TO_TUPLE_OF_NUMBER(coefficients, num_up_dim)); \
} \
else if constexpr(name == IndexTransformEnum::Merge) \
{ \
constexpr auto low_lens = \
meta_data.template Get<Array<index_t, num_low_dim>>(0); \
\
return make_merge_transform(TO_TUPLE_OF_NUMBER(low_lens, num_low_dim)); \
} \
else if constexpr(name == IndexTransformEnum::UnMerge) \
{ \
constexpr auto up_lens = \
meta_data.template Get<Array<index_t, num_up_dim>>(0); \
\
return make_unmerge_transform(TO_TUPLE_OF_NUMBER(up_lens, num_up_dim)); \
} \
else if constexpr(name == IndexTransformEnum::Replicate) \
{ \
constexpr auto up_lens = \
meta_data.template Get<Array<index_t, num_up_dim>>(0); \
\
return make_replicate_transform(TO_TUPLE_OF_NUMBER(up_lens, num_up_dim)); \
} \
}, \
Number<num_transform>{}); \
}(); \
\
constexpr auto low_dim_idss = [&encoded_transforms, &num_transform]() { \
return generate_tuple( \
[&encoded_transforms](auto i) { \
constexpr auto num_low_dim = encoded_transforms[i].template At<2>(); \
constexpr auto low_dims = encoded_transforms[i].template At<3>(); \
\
return TO_SEQUENCE(low_dims, num_low_dim); \
}, \
Number<num_transform>()); \
}(); \
\
constexpr auto up_dim_idss = [&encoded_transforms, &num_transform] { \
return generate_tuple( \
[&encoded_transforms](auto i) { \
constexpr auto num_up_dim = encoded_transforms[i].template At<4>(); \
constexpr auto up_dims = encoded_transforms[i].template At<5>(); \
\
return TO_SEQUENCE(up_dims, num_up_dim); \
}, \
Number<num_transform>()); \
}(); \
\
constexpr auto bottom_dim_ids = TO_SEQUENCE(encoded_bottom_dims, num_bottom_dim); \
constexpr auto top_dim_ids = TO_SEQUENCE(encoded_top_dims, num_top_dim); \
\
return TensorAdaptor<remove_cvref_t<decltype(trans)>, \
remove_cvref_t<decltype(low_dim_idss)>, \
remove_cvref_t<decltype(up_dim_idss)>, \
remove_cvref_t<decltype(bottom_dim_ids)>, \
remove_cvref_t<decltype(top_dim_ids)>>{trans}; \
}()
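// ---------------------------------------------------------------------------
// Illustrative, hypothetical follow-up to the sketch after
// CONSTRUCT_TENSOR_ADAPTOR_FROM_ENCODING: when every value in the per-transform
// meta data is readable at compile time, the same encoding can be handed to the
// static variant, which wraps the transform lengths in Number<>s:
//
//   constexpr auto static_adaptor = CONSTRUCT_STATIC_TENSOR_ADAPTOR_FROM_ENCODING(encoding);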