Commit 92661018 authored by Chao Liu

debugging scratch mem issue

parent adc4f1bf
@@ -402,7 +402,7 @@ struct DummyDynamicTransform_v1
                   C,
                   Y,
                   X);
-#elif 0
+#elif 1
         index_t tmp = idx_diff[15];
         const index_t const_tmp_0 = tmp / (Y * X);
         tmp -= const_tmp_0 * (Y * X);
@@ -422,7 +422,7 @@ struct DummyDynamicTransform_v1
                   C,
                   Y,
                   X);
-#elif 1
+#elif 0
         index_t tmp = idx_diff[15];
         const index_t const_tmp_0 = __llvm_amdgcn_readfirstlane_i32(tmp / (Y * X));
         tmp -= const_tmp_0 * (Y * X);
...
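
The two hunks above flip the active branch from the readfirstlane variant back to the plain per-lane division. As a hedged sketch of what the readfirstlane path buys, assuming idx is wavefront-uniform (__builtin_amdgcn_readfirstlane is the upstream clang builtin corresponding to the repo's __llvm_amdgcn_readfirstlane_i32 wrapper; the function name here is illustrative):

    __device__ int decompose_uniform_index(int idx, int Y, int X)
    {
        // Valid only when idx holds the same value in every active lane:
        // readfirstlane lets the compiler keep the division in scalar (SGPR)
        // arithmetic instead of repeating it per lane in VGPRs.
        const int q = __builtin_amdgcn_readfirstlane(idx / (Y * X));
        return idx - q * (Y * X); // remainder of the index decomposition
    }
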
@@ -119,7 +119,7 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
         // blockwise atomic accumulation
         auto blockwise_copy =
-#if 0
+#if 1
             BlockwiseDynamicTensorSliceTransfer_v1<BlockSize,
                                                    float,
                                                    float,
@@ -139,26 +139,26 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
                                                    1,
                                                    1>(
 #else
             BlockwiseDynamicTensorSliceTransfer_v2<BlockSize,
                                                    float,
                                                    float,
                                                    decltype(col_gemmk_gemmn_global_desc),
                                                    decltype(img_gemmk_gemmn_global_desc),
                                                    Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                    BlockCopySubLengths_GemmK_GemmN,
                                                    BlockCopyClusterLengths_GemmK_GemmN,
                                                    BlockCopyThreadClusterArrangeOrder,
                                                    BlockCopySrcAccessOrder,
                                                    BlockCopyDstAccessOrder,
                                                    1,
                                                    1,
                                                    BlockCopyDataPerAccess_GemmN,
                                                    BlockCopyDataPerAccess_GemmN,
                                                    AddressSpace::Global,
                                                    AddressSpace::Global,
                                                    InMemoryDataOperation::AtomicAdd,
                                                    1,
                                                    1>(
 #endif
             col_gemmk_gemmn_global_desc,
             make_multi_index(0, gemmn_block_data_on_global),
...
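
Only the preprocessor toggle changes here: the col2im kernel now uses BlockwiseDynamicTensorSliceTransfer_v1 instead of v2, keeping InMemoryDataOperation::AtomicAdd. As a standalone, hedged sketch of the per-element effect of that AtomicAdd operation (names are illustrative, not the repo's transfer API):

    // Overlapping filter windows map several (gemmk, gemmn) column entries to
    // the same image offset, so col2im must accumulate atomically.
    __device__ void col2im_accumulate(const float* p_col, float* p_img,
                                      int col_offset, int img_offset, bool valid)
    {
        if(valid)
            atomicAdd(&p_img[img_offset], p_col[col_offset]);
    }
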
#ifndef CK_DYNAMIC_GRIDWISE_COPY_GEMMKGEMMN_HPP
#define CK_DYNAMIC_GRIDWISE_COPY_GEMMKGEMMN_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"
#include "threadwise_dynamic_tensor_slice_transfer.hpp"

namespace ck {

template <index_t BlockSize,
          index_t GemmKPerBlock,
          index_t GemmNPerBlock,
          typename BlockCopySubLengths_GemmK_GemmN,
          typename BlockCopyClusterLengths_GemmK_GemmN,
          typename BlockCopyThreadClusterArrangeOrder,
          typename BlockCopySrcAccessOrder,
          typename BlockCopyDstAccessOrder,
          index_t BlockCopyDataPerAccess_GemmN>
struct DynamicGridwiseCopy_gemmkgemmn
{
#if 1
    // blockwise path: each block copies its [GemmKPerBlock, GemmNPerBlock]
    // slice, sliding the window along GemmK
    template <typename... Src, typename... Dst>
    __device__ void Run(const float* const __restrict__ p_src_global,
                        float* const __restrict__ p_dst_global,
                        const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
                        const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        const index_t GemmK = src_gemmk_gemmn_global_desc.GetLength(I0);
        const index_t GemmN = src_gemmk_gemmn_global_desc.GetLength(I1);

        // divide block work by GemmN
        const index_t GemmNBlockWork = GemmN / GemmNPerBlock;

        const index_t block_work_id = get_block_1d_id();

        const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;

        // blockwise copy (plain Set here, unlike the col2im atomic accumulation)
        auto blockwise_copy =
#if 0
            BlockwiseDynamicTensorSliceTransfer_v1<BlockSize,
                                                   float,
                                                   float,
                                                   decltype(src_gemmk_gemmn_global_desc),
                                                   decltype(dst_gemmk_gemmn_global_desc),
                                                   Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                   BlockCopySubLengths_GemmK_GemmN,
                                                   BlockCopyClusterLengths_GemmK_GemmN,
                                                   BlockCopyThreadClusterArrangeOrder,
                                                   BlockCopySrcAccessOrder,
                                                   1,
                                                   BlockCopyDataPerAccess_GemmN,
                                                   BlockCopyDataPerAccess_GemmN,
                                                   AddressSpace::Global,
                                                   AddressSpace::Global,
                                                   InMemoryDataOperation::Set,
                                                   1,
                                                   1>(
#else
            BlockwiseDynamicTensorSliceTransfer_v2<BlockSize,
                                                   float,
                                                   float,
                                                   decltype(src_gemmk_gemmn_global_desc),
                                                   decltype(dst_gemmk_gemmn_global_desc),
                                                   Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                   BlockCopySubLengths_GemmK_GemmN,
                                                   BlockCopyClusterLengths_GemmK_GemmN,
                                                   BlockCopyThreadClusterArrangeOrder,
                                                   BlockCopySrcAccessOrder,
                                                   BlockCopyDstAccessOrder,
                                                   1,
                                                   1,
                                                   BlockCopyDataPerAccess_GemmN,
                                                   BlockCopyDataPerAccess_GemmN,
                                                   AddressSpace::Global,
                                                   AddressSpace::Global,
                                                   InMemoryDataOperation::Set,
                                                   1,
                                                   1>(
#endif
                src_gemmk_gemmn_global_desc,
                make_multi_index(0, gemmn_block_data_on_global),
                dst_gemmk_gemmn_global_desc,
                make_multi_index(0, gemmn_block_data_on_global));

        for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
        {
            blockwise_copy.Run(p_src_global, p_dst_global);

            blockwise_copy.MoveSrcSliceWindow(make_multi_index(GemmKPerBlock, 0));
            blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
        }
    }
#else
    // threadwise path: each thread copies its own sub-slice directly
    template <typename... Src, typename... Dst>
    __device__ void Run(const float* const __restrict__ p_src_global,
                        float* const __restrict__ p_dst_global,
                        const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
                        const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        const index_t GemmK = src_gemmk_gemmn_global_desc.GetLength(I0);
        const index_t GemmN = src_gemmk_gemmn_global_desc.GetLength(I1);

        // divide block work by GemmN
        const index_t GemmNBlockWork = GemmN / GemmNPerBlock;

        const index_t block_work_id = get_block_1d_id();

        // divide thread work by GemmK, GemmN
        static constexpr auto thread_cluster_desc = make_cluster_descriptor(
            BlockCopyClusterLengths_GemmK_GemmN{}, BlockCopyThreadClusterArrangeOrder{});

        const auto thread_work_id =
            thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());

        // gemmk, gemmn
        constexpr index_t GemmKPerThread = BlockCopySubLengths_GemmK_GemmN::At(I0);
        constexpr index_t GemmNPerThread = BlockCopySubLengths_GemmK_GemmN::At(I1);

        const index_t gemmk_thread_data_on_global =
            thread_work_id[I0] * BlockCopySubLengths_GemmK_GemmN::At(I0);

        const index_t gemmn_thread_data_on_global =
            block_work_id * GemmNPerBlock +
            thread_work_id[I1] * BlockCopySubLengths_GemmK_GemmN::At(I1);

        auto src_coord = make_dynamic_tensor_coordinate(
            src_gemmk_gemmn_global_desc,
            make_multi_index(gemmk_thread_data_on_global, gemmn_thread_data_on_global));

        auto dst_coord = make_dynamic_tensor_coordinate(
            dst_gemmk_gemmn_global_desc,
            make_multi_index(gemmk_thread_data_on_global, gemmn_thread_data_on_global));

        threadwise_dynamic_tensor_slice_transfer_v1<float,
                                                    float,
                                                    decltype(src_gemmk_gemmn_global_desc),
                                                    decltype(dst_gemmk_gemmn_global_desc),
                                                    BlockCopySubLengths_GemmK_GemmN,
                                                    BlockCopyThreadClusterArrangeOrder,
                                                    1,
                                                    1,
                                                    1,
                                                    AddressSpace::Global,
                                                    AddressSpace::Global,
                                                    InMemoryDataOperation::Set,
                                                    1,
                                                    1>(src_gemmk_gemmn_global_desc,
                                                       src_coord,
                                                       p_src_global,
                                                       dst_gemmk_gemmn_global_desc,
                                                       dst_coord,
                                                       p_dst_global);
    }
#endif
};

} // namespace ck
#endif
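
For intuition, the blockwise Run above is equivalent to the following scalar loop nest, given packed row-major [GemmK, GemmN] descriptors as instantiated by the copy driver below (a hedged host-side sketch with illustrative names, not part of the kernel):

    // Each grid block owns the column range [n0, n0 + NPerBlock) and slides a
    // [KPerBlock x NPerBlock] window down the K axis, as MoveSrcSliceWindow /
    // MoveDstSliceWindow do in the kernel.
    void copy_windowed(const float* src, float* dst,
                       int GemmK, int GemmN, int KPerBlock,
                       int n0, int NPerBlock)
    {
        for(int k0 = 0; k0 < GemmK; k0 += KPerBlock)
            for(int k = k0; k < k0 + KPerBlock; ++k)
                for(int n = n0; n < n0 + NPerBlock; ++n)
                    dst[k * GemmN + n] = src[k * GemmN + n];
    }
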
@@ -59,12 +59,13 @@ __host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
     // hardcoded for 2d loop for now
 #pragma unroll
-    for(int j0 = 0; j0 < J0; ++j0)
+    for(index_t j0 = 0; j0 < J0; ++j0)
     {
 #pragma unroll
-        for(int j1 = 0; j1 < J1; ++j1)
+        for(index_t j1 = 0; j1 < J1; ++j1)
         {
             // do work
+#if 0
             transfer_data<SrcData,
                           1,
                           SrcAddressSpace,
@@ -80,6 +81,35 @@ __host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
                 dst_coord.GetOffset(),
                 coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
                 dst_desc.GetElementSpaceSize());
+#else
+            SrcData tmp;
+
+            transfer_data<SrcData,
+                          1,
+                          SrcAddressSpace,
+                          AddressSpace::Vgpr,
+                          InMemoryDataOperation::Set,
+                          1,
+                          1>(
+                p_src,
+                src_coord.GetOffset(),
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord),
+                src_desc.GetElementSpaceSize(),
+                &tmp,
+                0,
+                true,
+                1);
+
+            transfer_data<DstData, 1, AddressSpace::Vgpr, DstAddressSpace, DstInMemOp, 1, 1>(
+                &tmp,
+                0,
+                true,
+                1,
+                p_dst,
+                dst_coord.GetOffset(),
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
+                dst_desc.GetElementSpaceSize());
+#endif

             // move dim1 iterator
             if(j1 < J1 - 1)
...
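
The added branch stages each element in a register temporary instead of issuing one fused transfer, presumably to localize the scratch-memory issue named in the commit message: if tmp fails to stay in a VGPR, it spills to scratch. A minimal hedged sketch of the pattern with illustrative names (not the repo's transfer_data signature):

    template <typename T>
    __device__ void copy_via_register(const T* p_src, int src_offset, bool src_valid,
                                      T* p_dst, int dst_offset, bool dst_valid)
    {
        // step 1: global -> register, predicated on a valid source offset
        T tmp = src_valid ? p_src[src_offset] : T{0};

        // step 2: register -> global, predicated on a valid destination offset
        if(dst_valid)
            p_dst[dst_offset] = tmp;
    }
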
@@ -85,7 +85,6 @@
 #define CK_WORKAROUND_SWDEV_241664 1
 #endif

 namespace ck {
-
 enum AddressSpace
...
@@ -17,17 +17,21 @@ install(TARGETS host LIBRARY DESTINATION lib)
 if(DEVICE_BACKEND STREQUAL "AMD")
     set(CONV_SOURCE src/conv_driver.cpp)
     set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp)
-    set(TRY_SOURCE src/col2im_driver.cpp)
+    set(COL2IM_SOURCE src/col2im_driver.cpp)
+    set(COPY_SOURCE src/copy_driver.cpp)
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
     set(CONV_SOURCE src/conv_driver.cu)
     set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu)
-    set(TRY_SOURCE src/col2im_driver.cu)
+    set(COL2IM_SOURCE src/col2im_driver.cu)
+    set(COPY_SOURCE src/copy_driver.cu)
 endif()

 add_executable(conv_driver ${CONV_SOURCE})
 add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE})
-add_executable(col2im_driver ${TRY_SOURCE})
+add_executable(col2im_driver ${COL2IM_SOURCE})
+add_executable(copy_driver ${COPY_SOURCE})

 target_link_libraries(conv_driver PRIVATE host)
 target_link_libraries(conv_bwd_data_driver PRIVATE host)
 target_link_libraries(col2im_driver PRIVATE host)
+target_link_libraries(copy_driver PRIVATE host)
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dynamic_gridwise_copy_gemmkgemmn.hpp"

template <typename T, typename SrcDesc, typename DstDesc>
void device_dynamic_copy(SrcDesc,
                         const Tensor<T>& src_gemmk_gemmn,
                         DstDesc,
                         Tensor<T>& dst_gemmk_gemmn)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    std::size_t data_sz = sizeof(T);

    DeviceMem src_gemmk_gemmn_device_buf(data_sz * src_gemmk_gemmn.mDesc.GetElementSpace());
    DeviceMem dst_gemmk_gemmn_device_buf(data_sz * dst_gemmk_gemmn.mDesc.GetElementSpace());

    src_gemmk_gemmn_device_buf.ToDevice(src_gemmk_gemmn.mData.data());

    // build dynamic (runtime) descriptors from the compile-time ones
    const auto src_gemmk_gemmn_desc = make_dynamic_native_tensor_descriptor<2>(
        to_multi_index(SrcDesc::GetLengths()), to_multi_index(SrcDesc::GetStrides()));
    const auto dst_gemmk_gemmn_desc = make_dynamic_native_tensor_descriptor<2>(
        to_multi_index(DstDesc::GetLengths()), to_multi_index(DstDesc::GetStrides()));

    index_t GemmK = src_gemmk_gemmn_desc.GetLength(I0);
    index_t GemmN = src_gemmk_gemmn_desc.GetLength(I1);

#if 1
    // tuning parameters for the blockwise copy
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmKPerBlock = 8;
    constexpr index_t GemmNPerBlock = 128;

    using BlockCopySubLengths_GemmK_GemmN     = Sequence<1, 8>;
    using BlockCopyClusterLengths_GemmK_GemmN = Sequence<8, 16>;

    using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [GemmK, GemmN]
    using BlockCopySrcAccessOrder            = Sequence<0, 1>; // [GemmK, GemmN]
    using BlockCopyDstAccessOrder            = Sequence<0, 1>; // [GemmK, GemmN]

    constexpr index_t BlockCopyDataPerAccess_GemmN = 1;
#endif

    // one block per GemmNPerBlock-wide slice (assumes GemmN % GemmNPerBlock == 0)
    const index_t GridSize = GemmN / GemmNPerBlock;

    printf("%s: BlockSize %d, GridSize %d \n", __func__, BlockSize, GridSize);

    constexpr auto gridwise_copy =
        DynamicGridwiseCopy_gemmkgemmn<BlockSize,
                                       GemmKPerBlock,
                                       GemmNPerBlock,
                                       BlockCopySubLengths_GemmK_GemmN,
                                       BlockCopyClusterLengths_GemmK_GemmN,
                                       BlockCopyThreadClusterArrangeOrder,
                                       BlockCopySrcAccessOrder,
                                       BlockCopyDstAccessOrder,
                                       BlockCopyDataPerAccess_GemmN>{};

    std::cout << "Start running " << std::endl;

    launch_kernel(run_gridwise_operation<decltype(gridwise_copy),
                                         const T* const __restrict__,
                                         T* const __restrict__,
                                         decltype(src_gemmk_gemmn_desc),
                                         decltype(dst_gemmk_gemmn_desc)>,
                  dim3(GridSize),
                  dim3(BlockSize),
                  0,
                  0,
                  const_cast<const T* const __restrict__>(
                      static_cast<T*>(src_gemmk_gemmn_device_buf.GetDeviceBuffer())),
                  const_cast<T* const __restrict__>(
                      static_cast<T*>(dst_gemmk_gemmn_device_buf.GetDeviceBuffer())),
                  src_gemmk_gemmn_desc,
                  dst_gemmk_gemmn_desc);

    dst_gemmk_gemmn_device_buf.FromDevice(dst_gemmk_gemmn.mData.data());
}
@@ -561,7 +561,7 @@ int main(int argc, char* argv[])
                 LeftPads{},
                 RightPads{},
                 nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,
...
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <thread> // std::thread::hardware_concurrency
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "tensor_descriptor_helper.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_dynamic_copy_gemmkgemmn.hpp"

int main(int argc, char* argv[])
{
    using namespace ck;

#if 1
    constexpr index_t GemmK = 8;
    constexpr index_t GemmN = 128;
#endif

    constexpr auto src_gemmk_gemmn_desc =
        make_native_tensor_descriptor_packed(Sequence<GemmK, GemmN>{});
    constexpr auto dst_gemmk_gemmn_desc =
        make_native_tensor_descriptor_packed(Sequence<GemmK, GemmN>{});

    ostream_tensor_descriptor(src_gemmk_gemmn_desc, std::cout << "src_gemmk_gemmn_desc: ");
    ostream_tensor_descriptor(dst_gemmk_gemmn_desc, std::cout << "dst_gemmk_gemmn_desc: ");

    Tensor<float> src_gemmk_gemmn(make_HostTensorDescriptor(src_gemmk_gemmn_desc));
    Tensor<float> dst_gemmk_gemmn_device(make_HostTensorDescriptor(dst_gemmk_gemmn_desc));

    std::size_t num_thread = std::thread::hardware_concurrency();

    if(argc != 3)
    {
        printf("arg1: do_verification, arg2: nrepeat\n");
        exit(1);
    }

    bool do_verification = atoi(argv[1]);
    std::size_t nrepeat  = atoi(argv[2]);

    if(do_verification)
    {
#if 0
        src_gemmk_gemmn.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#else
        src_gemmk_gemmn.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#endif
    }

    device_dynamic_copy(
        src_gemmk_gemmn_desc, src_gemmk_gemmn, dst_gemmk_gemmn_desc, dst_gemmk_gemmn_device);

    if(do_verification)
    {
        // a pure copy should reproduce the source exactly
        check_error(src_gemmk_gemmn, dst_gemmk_gemmn_device);

#if 0
        LogRange(std::cout << "src_gemmk_gemmn : ", src_gemmk_gemmn.mData, ",") << std::endl;
        LogRange(std::cout << "dst_gemmk_gemmn_device : ", dst_gemmk_gemmn_device.mData, ",") << std::endl;
#endif
    }
}