Commit e69b1970 authored by Liu Chao
Browse files

testing on v100

parent eb68e34c
......@@ -164,6 +164,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
constexpr index_t KBlockWork = K / KPerBlock;
constexpr index_t BBlockWork = B / BPerBlock;
#if 0
constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<KBlockWork, BBlockWork>{});
......@@ -171,6 +172,16 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
const index_t k_block_data_on_global = block_work_id[0] * KPerBlock;
const index_t b_block_data_on_global = block_work_id[1] * BPerBlock;
#else
constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<BBlockWork, KBlockWork>{});
const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
const index_t b_block_data_on_global = block_work_id[0] * BPerBlock;
const index_t k_block_data_on_global = block_work_id[1] * KPerBlock;
#endif
// input tensor
// global tensor in global memory
......
......@@ -75,6 +75,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
constexpr index_t ConvDilationH = ConvDilations{}[0];
constexpr index_t ConvDilationW = ConvDilations{}[1];
#if 0
// sanity-check for vectorized memory load
static_assert((Wo == 1 || (ConvStrideW == 1 || GemmBBlockCopySrcDataPerRead_GemmN == 1)) &&
(X == 1 || ConvDilationW % GemmBBlockCopySrcDataPerRead_GemmN == 0) &&
......@@ -82,6 +83,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
InRightPads{}[1] % GemmBBlockCopySrcDataPerRead_GemmN == 0,
"wrong! aligment requirement for vectorized global load of input tensor will "
"be violated");
#endif
// weight tensor
constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower(
......
......@@ -111,6 +111,7 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
constexpr index_t MBlockWork = M / MPerBlock;
constexpr index_t NBlockWork = N / NPerBlock;
#if 1
constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<MBlockWork, NBlockWork>{});
......@@ -118,6 +119,15 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
const index_t m_block_data_on_global = block_work_id[0] * MPerBlock;
const index_t n_block_data_on_global = block_work_id[1] * NPerBlock;
#else
constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<NBlockWork, MBlockWork>{});
const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
const index_t n_block_data_on_global = block_work_id[0] * NPerBlock;
const index_t m_block_data_on_global = block_work_id[1] * MPerBlock;
#endif
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
......
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment