Commit e69b1970 authored by ChLiu Chao
Browse files

testing on v100

parent eb68e34c
...@@ -164,6 +164,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer ...@@ -164,6 +164,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
constexpr index_t KBlockWork = K / KPerBlock; constexpr index_t KBlockWork = K / KPerBlock;
constexpr index_t BBlockWork = B / BPerBlock; constexpr index_t BBlockWork = B / BPerBlock;
#if 0
constexpr auto block_work_desc = constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<KBlockWork, BBlockWork>{}); make_cluster_descriptor(Sequence<KBlockWork, BBlockWork>{});
...@@ -171,6 +172,16 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer ...@@ -171,6 +172,16 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
const index_t k_block_data_on_global = block_work_id[0] * KPerBlock; const index_t k_block_data_on_global = block_work_id[0] * KPerBlock;
const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; const index_t b_block_data_on_global = block_work_id[1] * BPerBlock;
#else
constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<BBlockWork, KBlockWork>{});
const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
const index_t b_block_data_on_global = block_work_id[0] * BPerBlock;
const index_t k_block_data_on_global = block_work_id[1] * KPerBlock;
#endif
// input tensor // input tensor
// global tensor in global memory // global tensor in global memory
......
...@@ -75,6 +75,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw ...@@ -75,6 +75,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
constexpr index_t ConvDilationH = ConvDilations{}[0]; constexpr index_t ConvDilationH = ConvDilations{}[0];
constexpr index_t ConvDilationW = ConvDilations{}[1]; constexpr index_t ConvDilationW = ConvDilations{}[1];
#if 0
// sanity-check for vectorized memory load // sanity-check for vectorized memory load
static_assert((Wo == 1 || (ConvStrideW == 1 || GemmBBlockCopySrcDataPerRead_GemmN == 1)) && static_assert((Wo == 1 || (ConvStrideW == 1 || GemmBBlockCopySrcDataPerRead_GemmN == 1)) &&
(X == 1 || ConvDilationW % GemmBBlockCopySrcDataPerRead_GemmN == 0) && (X == 1 || ConvDilationW % GemmBBlockCopySrcDataPerRead_GemmN == 0) &&
...@@ -82,6 +83,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw ...@@ -82,6 +83,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
InRightPads{}[1] % GemmBBlockCopySrcDataPerRead_GemmN == 0, InRightPads{}[1] % GemmBBlockCopySrcDataPerRead_GemmN == 0,
"wrong! aligment requirement for vectorized global load of input tensor will " "wrong! aligment requirement for vectorized global load of input tensor will "
"be violated"); "be violated");
#endif
// weight tensor // weight tensor
constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower(
......
...@@ -111,6 +111,7 @@ struct GridwiseGemmTransposedANormalBNormalC_v1 ...@@ -111,6 +111,7 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
constexpr index_t MBlockWork = M / MPerBlock; constexpr index_t MBlockWork = M / MPerBlock;
constexpr index_t NBlockWork = N / NPerBlock; constexpr index_t NBlockWork = N / NPerBlock;
#if 1
constexpr auto block_work_desc = constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<MBlockWork, NBlockWork>{}); make_cluster_descriptor(Sequence<MBlockWork, NBlockWork>{});
...@@ -118,6 +119,15 @@ struct GridwiseGemmTransposedANormalBNormalC_v1 ...@@ -118,6 +119,15 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
const index_t m_block_data_on_global = block_work_id[0] * MPerBlock; const index_t m_block_data_on_global = block_work_id[0] * MPerBlock;
const index_t n_block_data_on_global = block_work_id[1] * NPerBlock; const index_t n_block_data_on_global = block_work_id[1] * NPerBlock;
#else
constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<NBlockWork, MBlockWork>{});
const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
const index_t n_block_data_on_global = block_work_id[0] * NPerBlock;
const index_t m_block_data_on_global = block_work_id[1] * MPerBlock;
#endif
// A matrix in LDS memory, dst of blockwise copy // A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment // be careful of LDS alignment
......
#pragma once 1111gma once
#include <unistd.h> #include <unistd.h>
#include "device.hpp" #include "device.hpp"
#include "tensor.hpp" #include "tensor.hpp"
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment