Unverified Commit 394dbf83 authored by Haocong WANG, committed by GitHub
Browse files

fix layernorm, reduction Ops (#4)



* [Navi3x] Fix Gridwise_multiple_d operation (#649)

* Add CMake Option "USE_OPT_NAVI3X"

* fix bug

* standardize docs (#655)

* Separate bibtex requirement from rocm-docs-core (#656)

* separate bibtex requirement from rocm-docs-core

* point requirements to source rocm-docs-core repo

* Add CMake Option "USE_OPT_NAVI3X" (#647)

* Add CMake Option "USE_OPT_NAVI3X"

* remove navi3x opt compile option from cmake script

* Conv + quantization + tanh (#645)

* Rename file. Prepare to support another activation

* Add comment for quantization

* Extract out_elementop

* Add tanh example

* Add conv + bias + tanh quantization instance

* Add missing parameter

* Refine cmake

* Add external api and client example

* Extract variable in example

* Fix the comment

---------
Co-authored-by: zjing14 <zhangjing14@gmail.com>

* Add a denorm test fix (#603)

* Add type_convert implementations for bf16

* Add the fix for conv_fwd
...
parent a0058be6
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
version: 2 version: 2
updates: updates:
- package-ecosystem: "pip" # See documentation for possible values - package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests directory: "/docs/.sphinx" # Location of package manifests
open-pull-requests-limit: 10 open-pull-requests-limit: 10
schedule: schedule:
interval: "daily" interval: "daily"
...@@ -19,7 +19,7 @@ using IndexDataType = int32_t; ...@@ -19,7 +19,7 @@ using IndexDataType = int32_t;
using InLayout = ck::tensor_layout::convolution::NHWC; using InLayout = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC; using OutLayout = ck::tensor_layout::convolution::NHWC;
#if 1 #if 0
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else #else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
......
...@@ -77,12 +77,12 @@ using DeviceGemmInstance = ...@@ -77,12 +77,12 @@ using DeviceGemmInstance =
ADataType, ADataType,
B0DataType, B0DataType,
B1DataType, B1DataType,
CDataType,
Acc0BiasDataType, Acc0BiasDataType,
Acc0DataType, Acc0DataType,
Acc1BiasDataType, Acc1BiasDataType,
Acc1DataType, Acc1DataType,
CShuffleDataType, CShuffleDataType,
CDataType,
AElementOp, AElementOp,
B0ElementOp, B0ElementOp,
Acc0ElementOp, Acc0ElementOp,
...@@ -93,6 +93,7 @@ using DeviceGemmInstance = ...@@ -93,6 +93,7 @@ using DeviceGemmInstance =
TensorSpecB0, TensorSpecB0,
TensorSpecB1, TensorSpecB1,
TensorSpecC, TensorSpecC,
1,
256, 256,
// Gemm 0 // Gemm 0
128, // MPerBlock 128, // MPerBlock
......
...@@ -70,10 +70,10 @@ ...@@ -70,10 +70,10 @@
// TODO: enable buffer load when found correct 3rd dword // TODO: enable buffer load when found correct 3rd dword
// buffer load // buffer load
#define CK_USE_AMD_BUFFER_LOAD 0 #define CK_USE_AMD_BUFFER_LOAD 1
// buffer store // buffer store
#define CK_USE_AMD_BUFFER_STORE 0 #define CK_USE_AMD_BUFFER_STORE 1
// buffer atomic add: integer // buffer atomic add: integer
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1
......
...@@ -34,12 +34,12 @@ template <index_t NumDimG, ...@@ -34,12 +34,12 @@ template <index_t NumDimG,
typename ADataType, typename ADataType,
typename B0DataType, typename B0DataType,
typename B1DataType, typename B1DataType,
typename CDataType,
typename Acc0BiasDataType, typename Acc0BiasDataType,
typename Acc0DataType, typename Acc0DataType,
typename Acc1BiasDataType, typename Acc1BiasDataType,
typename Acc1DataType, typename Acc1DataType,
typename CShuffleDataType, typename CShuffleDataType,
typename CDataType,
typename AElementwiseOperation, typename AElementwiseOperation,
typename B0ElementwiseOperation, typename B0ElementwiseOperation,
typename AccElementwiseOperation, typename AccElementwiseOperation,
...@@ -50,6 +50,7 @@ template <index_t NumDimG, ...@@ -50,6 +50,7 @@ template <index_t NumDimG,
TensorSpecialization B0Spec, TensorSpecialization B0Spec,
TensorSpecialization B1Spec, TensorSpecialization B1Spec,
TensorSpecialization CSpec, TensorSpecialization CSpec,
ck::index_t NumPrefetch,
ck::index_t BlockSize, ck::index_t BlockSize,
ck::index_t MPerBlock, ck::index_t MPerBlock,
ck::index_t LPerBlock, ck::index_t LPerBlock,
...@@ -90,7 +91,6 @@ template <index_t NumDimG, ...@@ -90,7 +91,6 @@ template <index_t NumDimG,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CShuffleBlockTransferScalarPerVector_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
MaskingSpecialization MaskingSpec, MaskingSpecialization MaskingSpec,
ck::index_t NumPrefetch = 1,
ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::LoopScheduler LoopSched = make_default_loop_scheduler(),
ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1> ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle
...@@ -147,7 +147,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle ...@@ -147,7 +147,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle
static constexpr auto B0EnableLds_auto = MWaves == 1 ? false : true; static constexpr auto B0EnableLds_auto = MWaves == 1 ? false : true;
static constexpr auto B1EnableLds_auto = MWaves == 1 ? false : true; static constexpr auto B1EnableLds_auto = MWaves == 1 ? false : true;
static constexpr auto AEnableLds_manu = true; static constexpr auto AEnableLds_manu = false;
static constexpr auto B0EnableLds_manu = true; static constexpr auto B0EnableLds_manu = true;
static constexpr auto B1EnableLds_manu = true; static constexpr auto B1EnableLds_manu = true;
......
...@@ -714,13 +714,8 @@ struct GridwiseGemmMultipleD_Wmma ...@@ -714,13 +714,8 @@ struct GridwiseGemmMultipleD_Wmma
const auto MBlock = M / MPerBlock; const auto MBlock = M / MPerBlock;
const auto NBlock = N / NPerBlock; const auto NBlock = N / NPerBlock;
const auto e_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( const auto e_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
e_grid_desc_m_n, e_grid_desc_m_n,
make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
make_unmerge_transform(make_tuple(NBlock, Number<NPerBlock>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
return e_grid_desc_mblock_mperblock_nblock_nperblock; return e_grid_desc_mblock_mperblock_nblock_nperblock;
} }
......
...@@ -1407,32 +1407,32 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow ...@@ -1407,32 +1407,32 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
if constexpr(IntraRowSwizzlePerm) if constexpr(IntraRowSwizzlePerm)
{ {
temp = __builtin_amdgcn_permlane16( temp = __builtin_amdgcn_permlane16(
temp, type_convert<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0); temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
v_this_row = type_convert<SrcData>(temp); v_this_row = type_convert_sp<SrcData>(temp);
} }
// apply inter-row permute. // apply inter-row permute.
temp = __builtin_amdgcn_permlanex16(temp, temp = __builtin_amdgcn_permlanex16(temp,
type_convert<int>(v_this_row), type_convert_sp<int>(v_this_row),
LowEightRowlaneIdx, LowEightRowlaneIdx,
HighEightRowLaneIdx, HighEightRowLaneIdx,
1, 1,
0); 0);
v_theother_row = type_convert<SrcData>(temp); v_theother_row = type_convert_sp<SrcData>(temp);
if(get_thread_local_1d_id() % 32 < 16) if(get_thread_local_1d_id() % 32 < 16)
{ {
// apply type convert // apply type convert
dst_buf(Number<dst_offset>{}) = type_convert<DstData>(v_this_row); dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_this_row);
dst_buf(Number<dst_offset + DstScalarPerVector>{}) = dst_buf(Number<dst_offset + DstScalarPerVector>{}) =
type_convert<DstData>(v_theother_row); type_convert_sp<DstData>(v_theother_row);
} }
else else
{ {
// apply type convert // apply type convert
dst_buf(Number<dst_offset + DstScalarPerVector>{}) = dst_buf(Number<dst_offset + DstScalarPerVector>{}) =
type_convert<DstData>(v_this_row); type_convert_sp<DstData>(v_this_row);
dst_buf(Number<dst_offset>{}) = type_convert<DstData>(v_theother_row); dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_theother_row);
} }
}); });
}); });
......
...@@ -964,8 +964,17 @@ inline __host__ __device__ constexpr float type_convert<float, bhalf_t>(bhalf_t ...@@ -964,8 +964,17 @@ inline __host__ __device__ constexpr float type_convert<float, bhalf_t>(bhalf_t
return u.fp32; return u.fp32;
} }
// Convert X to Y
template <typename Y, typename X>
__host__ __device__ constexpr Y type_convert_sp(X x)
{
static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
return static_cast<Y>(x);
}
template <> template <>
inline __host__ __device__ constexpr int type_convert<int, float>(float x) inline __host__ __device__ constexpr int type_convert_sp<int, float>(float x)
{ {
union union
{ {
...@@ -977,7 +986,7 @@ inline __host__ __device__ constexpr int type_convert<int, float>(float x) ...@@ -977,7 +986,7 @@ inline __host__ __device__ constexpr int type_convert<int, float>(float x)
} }
template <> template <>
inline __host__ __device__ constexpr float type_convert<float, int>(int x) inline __host__ __device__ constexpr float type_convert_sp<float, int>(int x)
{ {
union union
{ {
...@@ -989,7 +998,7 @@ inline __host__ __device__ constexpr float type_convert<float, int>(int x) ...@@ -989,7 +998,7 @@ inline __host__ __device__ constexpr float type_convert<float, int>(int x)
} }
template <> template <>
inline __host__ __device__ constexpr int type_convert<int, half_t>(half_t x) inline __host__ __device__ constexpr int type_convert_sp<int, half_t>(half_t x)
{ {
union union
{ {
...@@ -1001,7 +1010,7 @@ inline __host__ __device__ constexpr int type_convert<int, half_t>(half_t x) ...@@ -1001,7 +1010,7 @@ inline __host__ __device__ constexpr int type_convert<int, half_t>(half_t x)
} }
template <> template <>
inline __host__ __device__ constexpr half_t type_convert<half_t, int>(int x) inline __host__ __device__ constexpr half_t type_convert_sp<half_t, int>(int x)
{ {
union union
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment