Unverified Commit dc70e3e1 authored by arai713's avatar arai713 Committed by GitHub
Browse files

Merge branch 'develop' into gridwise_2d

parents 10947a54 8ee36118
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck {
namespace tensor_operation {
namespace device {
template <index_t K_BlockTileSize, index_t KThreadSliceSize>
struct GetReduceCountPerThreadForBlockwiseWelford
{
GetReduceCountPerThreadForBlockwiseWelford(index_t numBlockTileIteration,
long_index_t reduce_length)
: numBlockTileIteration_{numBlockTileIteration}
{
count_in_last_tile_ = reduce_length % K_BlockTileSize;
};
__device__ index_t operator()(index_t thread_k_cluster_id) const
{
if(count_in_last_tile_ == 0)
return (KThreadSliceSize * numBlockTileIteration_);
else
{
index_t num_complete_slice = count_in_last_tile_ / KThreadSliceSize;
index_t count_in_last_slice = count_in_last_tile_ % KThreadSliceSize;
if(thread_k_cluster_id < num_complete_slice)
return (KThreadSliceSize * numBlockTileIteration_);
else if(thread_k_cluster_id == num_complete_slice)
return (KThreadSliceSize * (numBlockTileIteration_ - 1) + count_in_last_slice);
else
return (KThreadSliceSize * (numBlockTileIteration_ - 1));
};
};
index_t numBlockTileIteration_;
index_t count_in_last_tile_;
};
// Functor returning how many elements a thread contributes when a reduce
// dimension of `reduce_length` elements is split over `blkGroupSize` blocks.
// Every block except the last one runs `numBlockTileIteration` tiles of
// K_BlockTileSize elements; the last block handles whatever remains.
// Within a tile each thread handles a slice of KThreadSliceSize elements.
//
// Constructor arguments:
//   blkGroupSize          - number of blocks sharing one reduction
//   numBlockTileIteration - tile iterations executed by each non-last block
//   reduce_length         - total length of the reduce dimension
//
// operator() arguments:
//   block_local_id        - index of the block inside its group
//   thread_k_cluster_id   - id of the thread along the K cluster dimension
template <index_t K_BlockTileSize, index_t KThreadSliceSize>
struct GetReduceCountPerThreadForMultiblockWelford
{
    GetReduceCountPerThreadForMultiblockWelford(index_t blkGroupSize,
                                                index_t numBlockTileIteration,
                                                long_index_t reduce_length)
        : blkGroupSize_(blkGroupSize), numBlockTileIteration_{numBlockTileIteration}
    {
        // Widen to long_index_t before multiplying so the product is not
        // evaluated (and possibly overflowed) in index_t arithmetic.
        const long_index_t length_before_last_block =
            static_cast<long_index_t>(K_BlockTileSize) * numBlockTileIteration_ *
            (blkGroupSize_ - 1);

        last_block_reduce_length_ =
            static_cast<index_t>(reduce_length - length_before_last_block);

        // ceil-div: tiles the last block needs to cover its share
        numBlockTileIterationByLastBlock_ =
            (last_block_reduce_length_ + K_BlockTileSize - 1) / K_BlockTileSize;
    }

    __device__ index_t operator()(index_t block_local_id, index_t thread_k_cluster_id) const
    {
        // Non-last blocks, or a last block whose share is exactly as long as
        // everyone else's, process only full tiles.
        if(last_block_reduce_length_ == K_BlockTileSize * numBlockTileIteration_ ||
           block_local_id < blkGroupSize_ - 1)
            return (KThreadSliceSize * numBlockTileIteration_);

        const index_t count_in_last_tile = last_block_reduce_length_ % K_BlockTileSize;

        // The last block's share is a whole number of tiles.
        if(count_in_last_tile == 0)
            return (KThreadSliceSize * numBlockTileIterationByLastBlock_);

        const index_t num_complete_slice  = count_in_last_tile / KThreadSliceSize;
        const index_t count_in_last_slice = count_in_last_tile % KThreadSliceSize;

        if(thread_k_cluster_id < num_complete_slice)
            return (KThreadSliceSize * numBlockTileIterationByLastBlock_);

        // The thread on the boundary only receives the elements left over after
        // the complete slices (same rule as
        // GetReduceCountPerThreadForBlockwiseWelford). Adding the whole
        // count_in_last_tile here would count the complete slices twice.
        if(thread_k_cluster_id == num_complete_slice)
            return (KThreadSliceSize * (numBlockTileIterationByLastBlock_ - 1) +
                    count_in_last_slice);

        // Threads past the boundary get nothing from the final tile.
        return (KThreadSliceSize * (numBlockTileIterationByLastBlock_ - 1));
    }

    index_t blkGroupSize_;
    index_t numBlockTileIteration_;

    index_t last_block_reduce_length_;
    index_t numBlockTileIterationByLastBlock_;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
......@@ -66,6 +66,7 @@ template <index_t BlockSize,
index_t MPerBlock,
index_t NPerBlock,
index_t K0PerBlock,
index_t K1Value,
index_t M1PerThreadM111,
index_t N1PerThreadN111,
index_t KPerThread,
......@@ -96,7 +97,7 @@ struct GridwiseGemmDl_km_kn_mn_v1r3
static constexpr auto I3 = Number<3>{};
// K1 should be Number<...>
static constexpr auto K1 = AGridDesc_K0_M_K1{}.GetLength(I2);
static constexpr auto K1 = Number<K1Value>{};
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
......
......@@ -3,6 +3,7 @@
#pragma once
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
namespace ck {
......
......@@ -593,7 +593,8 @@ struct XdlopsGemm
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
using CIndex = MultiIndex<2>;
using CIndex = MultiIndex<2>;
using CIndex4D = MultiIndex<4>;
__device__ static constexpr index_t GetNumBlks() { return mfma_instr.num_output_blks; }
......@@ -822,6 +823,16 @@ struct XdlopsGemm
return TransposeC ? CIndex{n_offset, m_offset} : CIndex{m_offset, n_offset};
}
// Builds a 4-D index from the two components returned by GetBlkIdx().
// Both parameters are unused. The component order depends on TransposeC:
//   TransposeC == true  -> {blk_idx[I1], 0, blk_idx[I0], 0}
//   TransposeC == false -> {0, blk_idx[I0], 0, blk_idx[I1]}
__device__ static CIndex4D GetBeginOfThreadBlk4D(index_t /* xdlops_i */, index_t /* blk_i */)
{
    const auto thread_blk_idx = GetBlkIdx();

    const auto first_component  = thread_blk_idx[I0];
    const auto second_component = thread_blk_idx[I1];

    if(TransposeC)
    {
        return CIndex4D{second_component, I0, first_component, I0};
    }

    return CIndex4D{I0, first_component, I0, second_component};
}
static constexpr auto mfma = MfmaSelector<base_type, MPerXdlops, NPerXdlops>{};
static constexpr auto mfma_instr = mfma.selected_mfma;
......
......@@ -6,6 +6,7 @@ function(add_instance_library INSTANCE_NAME)
clang_tidy_check(${INSTANCE_NAME})
endfunction(add_instance_library INSTANCE_NAME)
file(GLOB dir_list LIST_DIRECTORIES true *)
set(CK_DEVICE_INSTANCES)
FOREACH(subdir_path ${dir_list})
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment