Commit dab53610 authored by Chao Liu

remove copy_driver

parent 8e5085fa
#ifndef CK_DYNAMIC_GRIDWISE_COPY_GEMMKGEMMN_HPP
#define CK_DYNAMIC_GRIDWISE_COPY_GEMMKGEMMN_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"
#include "threadwise_dynamic_tensor_slice_transfer.hpp"
namespace ck {
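
// Copies a 2D [GemmK, GemmN] tensor tile-by-tile: the 1D grid is partitioned along
// GemmN, each block owns one GemmNPerBlock-wide strip and walks it along GemmK in
// GemmKPerBlock steps.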
template <index_t BlockSize,
          index_t GemmKPerBlock,
          index_t GemmNPerBlock,
          typename BlockCopySubLengths_GemmK_GemmN,
          typename BlockCopyClusterLengths_GemmK_GemmN,
          typename BlockCopyThreadClusterArrangeOrder,
          typename BlockCopySrcAccessOrder,
          typename BlockCopyDstAccessOrder,
          index_t BlockCopyDataPerAccess_GemmN>
struct DynamicGridwiseCopy_gemmkgemmn
{
    template <typename... Src, typename... Dst>
    __device__ void Run_r1(const float* const __restrict__ p_src_global,
                           float* const __restrict__ p_dst_global,
                           const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
                           const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        const index_t GemmK = src_gemmk_gemmn_global_desc.GetLength(I0);
        const index_t GemmN = src_gemmk_gemmn_global_desc.GetLength(I1);

        // divide block work along GemmN
        const index_t GemmNBlockWork = GemmN / GemmNPerBlock;

        const index_t block_work_id = get_block_1d_id();

        const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;

        // blockwise copy (InMemoryDataOperation::Set: a plain store, not atomic accumulation)
        auto blockwise_copy =
#if 0
            BlockwiseDynamicTensorSliceTransfer_v1r1<BlockSize,
                                                     float,
                                                     float,
                                                     decltype(src_gemmk_gemmn_global_desc),
                                                     decltype(dst_gemmk_gemmn_global_desc),
                                                     Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                     BlockCopySubLengths_GemmK_GemmN,
                                                     BlockCopyClusterLengths_GemmK_GemmN,
                                                     BlockCopyThreadClusterArrangeOrder,
                                                     BlockCopySrcAccessOrder,
                                                     1,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     AddressSpace::Global,
                                                     AddressSpace::Global,
                                                     InMemoryDataOperation::Set,
                                                     1,
                                                     1>
#elif 1
            BlockwiseDynamicTensorSliceTransfer_v2r1<BlockSize,
                                                     float,
                                                     float,
                                                     decltype(src_gemmk_gemmn_global_desc),
                                                     decltype(dst_gemmk_gemmn_global_desc),
                                                     Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                     BlockCopySubLengths_GemmK_GemmN,
                                                     BlockCopyClusterLengths_GemmK_GemmN,
                                                     BlockCopyThreadClusterArrangeOrder,
                                                     BlockCopySrcAccessOrder,
                                                     BlockCopyDstAccessOrder,
                                                     1,
                                                     1,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     AddressSpace::Global,
                                                     AddressSpace::Global,
                                                     InMemoryDataOperation::Set,
                                                     1,
                                                     1>
#endif
            (src_gemmk_gemmn_global_desc,
             make_multi_index(0, gemmn_block_data_on_global),
             dst_gemmk_gemmn_global_desc,
             make_multi_index(0, gemmn_block_data_on_global));
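
        // slide the GemmKPerBlock x GemmNPerBlock window along GemmK until the whole
        // GemmK extent of this block's strip has been copied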
        for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
        {
            blockwise_copy.Run(p_src_global, p_dst_global);

            blockwise_copy.MoveSrcSliceWindow(make_multi_index(GemmKPerBlock, 0));
            blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
        }
    }
    template <typename... Src, typename... Dst>
    __device__ void Run_r2(const float* const __restrict__ p_src_global,
                           float* const __restrict__ p_dst_global,
                           const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
                           const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        const index_t GemmK = src_gemmk_gemmn_global_desc.GetLength(I0);
        const index_t GemmN = src_gemmk_gemmn_global_desc.GetLength(I1);

        // divide block work along GemmN
        const index_t GemmNBlockWork = GemmN / GemmNPerBlock;

        const index_t block_work_id = get_block_1d_id();

        const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;

        // blockwise copy (InMemoryDataOperation::Set: a plain store, not atomic accumulation)
        auto blockwise_copy =
            BlockwiseDynamicTensorSliceTransfer_v2r2<BlockSize,
                                                     float,
                                                     float,
                                                     decltype(src_gemmk_gemmn_global_desc),
                                                     decltype(dst_gemmk_gemmn_global_desc),
                                                     Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                     BlockCopySubLengths_GemmK_GemmN,
                                                     BlockCopyClusterLengths_GemmK_GemmN,
                                                     BlockCopyThreadClusterArrangeOrder,
                                                     BlockCopySrcAccessOrder,
                                                     BlockCopyDstAccessOrder,
                                                     1,
                                                     1,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     AddressSpace::Global,
                                                     AddressSpace::Global,
                                                     InMemoryDataOperation::Set,
                                                     1,
                                                     1>(
                src_gemmk_gemmn_global_desc,
                make_multi_index(0, gemmn_block_data_on_global),
                dst_gemmk_gemmn_global_desc,
                make_multi_index(0, gemmn_block_data_on_global));
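
        // unlike the v2r1 transfer used in Run_r1, v2r2 does not capture the tensor
        // descriptors at construction; they are passed to Run() and the
        // MoveSrc/DstSliceWindow() calls on every use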
        for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
        {
            blockwise_copy.Run(src_gemmk_gemmn_global_desc,
                               p_src_global,
                               dst_gemmk_gemmn_global_desc,
                               p_dst_global);

            blockwise_copy.MoveSrcSliceWindow(src_gemmk_gemmn_global_desc,
                                              make_multi_index(GemmKPerBlock, 0));
            blockwise_copy.MoveDstSliceWindow(dst_gemmk_gemmn_global_desc,
                                              make_multi_index(GemmKPerBlock, 0));
        }
    }
    template <typename... Src, typename... Dst>
    __device__ void Run(const float* const __restrict__ p_src_global,
                        float* const __restrict__ p_dst_global,
                        const DynamicTensorDescriptor<Src...>& src_gemmk_gemmn_global_desc,
                        const DynamicTensorDescriptor<Dst...>& dst_gemmk_gemmn_global_desc) const
    {
        Run_r2(
            p_src_global, p_dst_global, src_gemmk_gemmn_global_desc, dst_gemmk_gemmn_global_desc);
    }
};
} // namespace ck
#endif
@@ -18,20 +18,16 @@ if(DEVICE_BACKEND STREQUAL "AMD")
     set(CONV_SOURCE src/conv_driver.cpp)
     set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp)
     set(COL2IM_SOURCE src/col2im_driver.cpp)
-    set(COPY_SOURCE src/copy_driver.cpp)
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
     set(CONV_SOURCE src/conv_driver.cu)
     set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu)
     set(COL2IM_SOURCE src/col2im_driver.cu)
-    set(COPY_SOURCE src/copy_driver.cu)
 endif()

 add_executable(conv_driver ${CONV_SOURCE})
 add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE})
 add_executable(col2im_driver ${COL2IM_SOURCE})
-add_executable(copy_driver ${COPY_SOURCE})

 target_link_libraries(conv_driver PRIVATE host)
 target_link_libraries(conv_bwd_data_driver PRIVATE host)
 target_link_libraries(col2im_driver PRIVATE host)
-target_link_libraries(copy_driver PRIVATE host)
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dynamic_gridwise_copy_gemmkgemmn.hpp"
template <typename T, typename SrcDesc, typename DstDesc>
void device_dynamic_copy(SrcDesc,
                         const Tensor<T>& src_gemmk_gemmn,
                         DstDesc,
                         Tensor<T>& dst_gemmk_gemmn)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    std::size_t data_sz = sizeof(T);

    DeviceMem src_gemmk_gemmn_device_buf(data_sz * src_gemmk_gemmn.mDesc.GetElementSpace());
    DeviceMem dst_gemmk_gemmn_device_buf(data_sz * dst_gemmk_gemmn.mDesc.GetElementSpace());

    src_gemmk_gemmn_device_buf.ToDevice(src_gemmk_gemmn.mData.data());
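
    // lower the compile-time SrcDesc/DstDesc into run-time dynamic tensor descriptors,
    // which is the form the dynamic gridwise copy kernel consumes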
    const auto src_gemmk_gemmn_desc = make_dynamic_native_tensor_descriptor<2>(
        to_multi_index(SrcDesc::GetLengths()), to_multi_index(SrcDesc::GetStrides()));
    const auto dst_gemmk_gemmn_desc = make_dynamic_native_tensor_descriptor<2>(
        to_multi_index(DstDesc::GetLengths()), to_multi_index(DstDesc::GetStrides()));
    index_t GemmK = src_gemmk_gemmn_desc.GetLength(I0);
    index_t GemmN = src_gemmk_gemmn_desc.GetLength(I1);

#if 1
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmKPerBlock = 8;
    constexpr index_t GemmNPerBlock = 128;

    using BlockCopySubLengths_GemmK_GemmN     = Sequence<1, 8>;
    using BlockCopyClusterLengths_GemmK_GemmN = Sequence<8, 16>;

    using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [GemmK, GemmN]
    using BlockCopySrcAccessOrder            = Sequence<0, 1>; // [GemmK, GemmN]
    using BlockCopyDstAccessOrder            = Sequence<0, 1>; // [GemmK, GemmN]

    constexpr index_t BlockCopyDataPerAccess_GemmN = 1;
#endif
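
    // one block per GemmNPerBlock-wide strip; this assumes GemmN is divisible by
    // GemmNPerBlock (true for the sizes used by this driver)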
    const index_t GridSize = GemmN / GemmNPerBlock;

    printf("%s: BlockSize %d, GridSize %d\n", __func__, BlockSize, GridSize);
    constexpr auto gridwise_copy =
        DynamicGridwiseCopy_gemmkgemmn<BlockSize,
                                       GemmKPerBlock,
                                       GemmNPerBlock,
                                       BlockCopySubLengths_GemmK_GemmN,
                                       BlockCopyClusterLengths_GemmK_GemmN,
                                       BlockCopyThreadClusterArrangeOrder,
                                       BlockCopySrcAccessOrder,
                                       BlockCopyDstAccessOrder,
                                       BlockCopyDataPerAccess_GemmN>{};
std::cout << "Start running " << std::endl;
    launch_kernel(run_gridwise_operation<decltype(gridwise_copy),
                                         const T* const __restrict__,
                                         T* const __restrict__,
                                         decltype(src_gemmk_gemmn_desc),
                                         decltype(dst_gemmk_gemmn_desc)>,
                  dim3(GridSize),
                  dim3(BlockSize),
                  0,
                  0,
                  const_cast<const T* const __restrict__>(
                      static_cast<T*>(src_gemmk_gemmn_device_buf.GetDeviceBuffer())),
                  const_cast<T* const __restrict__>(
                      static_cast<T*>(dst_gemmk_gemmn_device_buf.GetDeviceBuffer())),
                  src_gemmk_gemmn_desc,
                  dst_gemmk_gemmn_desc);

    dst_gemmk_gemmn_device_buf.FromDevice(dst_gemmk_gemmn.mData.data());
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <thread>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "tensor_descriptor_helper.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_dynamic_copy_gemmkgemmn.hpp"
int main(int argc, char* argv[])
{
    using namespace ck;

#if 1
    constexpr index_t GemmK = 8;
    constexpr index_t GemmN = 128;
#endif
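
    // with GemmKPerBlock = 8 and GemmNPerBlock = 128 on the device side, this problem
    // size yields GridSize = 1 and a single GemmK iteration per block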
    constexpr auto src_gemmk_gemmn_desc =
        make_native_tensor_descriptor_packed(Sequence<GemmK, GemmN>{});
    constexpr auto dst_gemmk_gemmn_desc =
        make_native_tensor_descriptor_packed(Sequence<GemmK, GemmN>{});

    ostream_tensor_descriptor(src_gemmk_gemmn_desc, std::cout << "src_gemmk_gemmn_desc: ");
    ostream_tensor_descriptor(dst_gemmk_gemmn_desc, std::cout << "dst_gemmk_gemmn_desc: ");

    Tensor<float> src_gemmk_gemmn(make_HostTensorDescriptor(src_gemmk_gemmn_desc));
    Tensor<float> dst_gemmk_gemmn_device(make_HostTensorDescriptor(dst_gemmk_gemmn_desc));
    std::size_t num_thread = std::thread::hardware_concurrency();

    if(argc != 3)
    {
        printf("arg1: do_verification, arg2: nrepeat\n");
        exit(1);
    }

    bool do_verification = atoi(argv[1]);
    std::size_t nrepeat  = atoi(argv[2]);

    if(do_verification)
    {
#if 0
        src_gemmk_gemmn.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#else
        src_gemmk_gemmn.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#endif
    }
    device_dynamic_copy(
        src_gemmk_gemmn_desc, src_gemmk_gemmn, dst_gemmk_gemmn_desc, dst_gemmk_gemmn_device);

    if(do_verification)
    {
        check_error(src_gemmk_gemmn, dst_gemmk_gemmn_device);

#if 0
        LogRange(std::cout << "src_gemmk_gemmn : ", src_gemmk_gemmn.mData, ",") << std::endl;
        LogRange(std::cout << "dst_gemmk_gemmn_device : ", dst_gemmk_gemmn_device.mData, ",") << std::endl;
#endif
    }
}