Unverified commit 394dbf83 authored by Haocong WANG, committed by GitHub

fix layernorm, reduction Ops (#4)



* [Navi3x] Fix Gridwise_multiple_d operation (#649)

* Add CMake Option "USE_OPT_NAVI3X"

* fix bug

* standardize docs (#655)

* Separate bibtex requirement from rocm-docs-core (#656)

* separate bibtex requirement from rocm-docs-core

* point requirements to source rocm-docs-core repo

* Add CMake Option "USE_OPT_NAVI3X" (#647)

* Add CMake Option "USE_OPT_NAVI3X"

* remove navi3x opt compile option from cmake script

* Conv + quantization + tanh  (#645)

* Rename file. Prepare to support another activation

* Add comment for quantization

* Extract out_elementop

* Add tanh example

* Add conv + bias + tanh quantization instance

* Add missing parameter

* Refine cmake

* Add external api and client example

* Extract variable in example

* Fix the comment

---------
Co-authored-by: zjing14 <zhangjing14@gmail.com>

* Add a denorm test fix (#603)

* Add type_convert implementations for bf16

* Add the fix for conv_fwd

* Add the fix for conv_bwd_data

* Add the fix for conv_bwd_weight

* Format

* Format

* Another format

* Add a macro to use workaround on MI200 only

* Format

---------
Co-authored-by: Rosty Geyyer <rosty.geyyer@amd.com>
Co-authored-by: zjing14 <zhangjing14@gmail.com>

* simplify karg in device/grid of split-k op (#644)

* simplify karg in device/grid split-k op

* fix mk_kn_mn instances

* add more instances

* use name from tensor layout

* fix 3rd dword of buffer source descriptor (#659)

* add fp64 instances (#658)
Co-authored-by: root <root@ctr-ubbsmc15.amd.com>

* Issue #666: Revert "simplify karg in device/grid of split-k op (#644)" (#665)

This reverts commit bb5530af.

* Groupnorm + swish external api (#668)

* Rename to proper naming

* Add example of groupnorm + swish

* Extract duplicate code in example

* Add groupnorm + swish instances

* Refactor instance generation, split into multiple cpp files

* Add external api and client example

* Refine profiler message

* Use ck math version of exp

* Refine problem size in example

* Add host version of exp

* add a macro to turn on/off denorm fix (off by default) (#673)

* add a macro to turn off denorm fix by default

* expose the macro

---------
Co-authored-by: root <root@ctr-ubbsmc15.amd.com>

* fixed quant example (#672)
Co-authored-by: root <root@ctr-ubbsmc15.amd.com>

* Add dependabot config and pin rocm-docs-core (#663)

* [gtest] suppress unsafe buffer warn (#670)

ref: https://github.com/ROCmSoftwarePlatform/MIOpen/pull/1912



* Add memory index guard in wmma device ops (#667)

* Add more macros to turn on/off denorm fix (#678)
Co-authored-by: Rosty Geyyer <rosty.geyyer@amd.com>

* Fix a typo (#676)

* Add (#677)

* Allow using ROCm release candidate compilers. (#679)

* enable use of rocm5.5 release candidate 4

* upgrade to ROCM5.5 RC5

* try fix the PUB_KEY error, remove the cmake-data package

* upgrade to latest cmake version

* use private dockerhub repo for rocm5.5 rc5

* add missing bracket

* Disable SkipLDS & Align AIT api

* Update dependabot config (#682)
Co-authored-by: samjwu <samjwu@users.noreply.github.com>

* update attn api

* solve type_convert bug + enable

---------
Co-authored-by: Sam Wu <sjwu@ualberta.ca>
Co-authored-by: Sam Wu <sam.wu2@amd.com>
Co-authored-by: rocking5566 <ChunYu.Lai@amd.com>
Co-authored-by: zjing14 <zhangjing14@gmail.com>
Co-authored-by: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Co-authored-by: Rosty Geyyer <rosty.geyyer@amd.com>
Co-authored-by: carlushuang <carlus.huang@amd.com>
Co-authored-by: root <root@ctr-ubbsmc15.amd.com>
Co-authored-by: Jun Liu <Liu.Jun@amd.com>
Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
Co-authored-by: samjwu <samjwu@users.noreply.github.com>
Co-authored-by: haocwang <Haocong.WANG@amd.com>
parent a0058be6
@@ -6,7 +6,7 @@
 version: 2
 updates:
   - package-ecosystem: "pip" # See documentation for possible values
-    directory: "/" # Location of package manifests
+    directory: "/docs/.sphinx" # Location of package manifests
     open-pull-requests-limit: 10
     schedule:
       interval: "daily"
@@ -19,7 +19,7 @@ using IndexDataType = int32_t;
 using InLayout  = ck::tensor_layout::convolution::NHWC;
 using OutLayout = ck::tensor_layout::convolution::NHWC;
-#if 1
+#if 0
 static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
 #else
 static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
 #endif
......
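The #if switch above selects which reduction the example exercises. In plain scalar terms (a host-side sketch, not the CK device kernel), the two modes compute the following over each reduced window:

#include <algorithm>
#include <cstddef>

// MAX reduction: the largest element of the window.
float reduce_max(const float* v, std::size_t n)
{
    float acc = v[0];
    for(std::size_t i = 1; i < n; ++i)
        acc = std::max(acc, v[i]);
    return acc;
}

// AVG reduction: the arithmetic mean of the window.
float reduce_avg(const float* v, std::size_t n)
{
    float acc = 0.f;
    for(std::size_t i = 0; i < n; ++i)
        acc += v[i];
    return acc / static_cast<float>(n);
}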
@@ -77,12 +77,12 @@ using DeviceGemmInstance =
         ADataType,
         B0DataType,
         B1DataType,
-        CDataType,
         Acc0BiasDataType,
         Acc0DataType,
         Acc1BiasDataType,
         Acc1DataType,
         CShuffleDataType,
+        CDataType,
         AElementOp,
         B0ElementOp,
         Acc0ElementOp,
@@ -93,6 +93,7 @@ using DeviceGemmInstance =
         TensorSpecB0,
         TensorSpecB1,
         TensorSpecC,
+        1,
         256,
         // Gemm 0
         128, // MPerBlock
......
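The explicit "1," added to the instance above corresponds to the NumPrefetch template parameter, which the device-op change further down moves from a defaulted trailing parameter to a required one placed before BlockSize. A minimal, hypothetical illustration (trimmed names, not the real class) of why every instantiation must now pass it:

// Before: NumPrefetch was a trailing parameter with a default, so existing
// instantiations could omit it, e.g.
//   template <int BlockSize, /* ... */ int NumPrefetch = 1>
// After: it precedes BlockSize and has no default, so each instance spells
// it out explicitly, as the "1," in the hunk above does.
template <int NumPrefetch, int BlockSize>
struct DeviceOpSketch
{
    static constexpr int num_prefetch = NumPrefetch;
    static constexpr int block_size   = BlockSize;
};

using InstanceSketch = DeviceOpSketch<1, 256>; // prefetch depth 1, 256 threads per block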
@@ -70,10 +70,10 @@
-// TODO: enable buffer load when found correct 3rd dword
 // buffer load
-#define CK_USE_AMD_BUFFER_LOAD 0
+#define CK_USE_AMD_BUFFER_LOAD 1
 // buffer store
-#define CK_USE_AMD_BUFFER_STORE 0
+#define CK_USE_AMD_BUFFER_STORE 1
 // buffer atomic add: integer
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1
......
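For context on the "3rd dword" mentioned in the removed TODO: AMD buffer loads and stores address memory through a 128-bit buffer resource descriptor, and its last dword carries architecture-specific configuration bits that Navi3x needs set differently, which is what #659 fixed. The helper below is only an illustrative sketch of how such a descriptor is assembled; the names and field comments are assumptions, not CK's actual implementation or the constant used in the fix.

#include <array>
#include <cstdint>

// Illustrative only: dwords 0-1 hold the base address, dword 2 the addressable
// range in bytes, and dword 3 (the "3rd dword", zero-indexed) per-architecture
// configuration bits such as data format and out-of-bounds behaviour.
using BufferResourceSketch = std::array<uint32_t, 4>;

inline BufferResourceSketch make_buffer_resource_sketch(const void* base,
                                                        uint32_t size_bytes,
                                                        uint32_t config_dword3)
{
    const uint64_t addr = reinterpret_cast<uint64_t>(base);
    return {static_cast<uint32_t>(addr),        // dword 0: base address, low 32 bits
            static_cast<uint32_t>(addr >> 32),  // dword 1: base address, high bits
            size_bytes,                         // dword 2: number of addressable bytes
            config_dword3};                     // dword 3: arch-specific config bits
}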
@@ -34,12 +34,12 @@ template <index_t NumDimG,
           typename ADataType,
           typename B0DataType,
           typename B1DataType,
-          typename CDataType,
           typename Acc0BiasDataType,
           typename Acc0DataType,
           typename Acc1BiasDataType,
           typename Acc1DataType,
           typename CShuffleDataType,
+          typename CDataType,
           typename AElementwiseOperation,
           typename B0ElementwiseOperation,
           typename AccElementwiseOperation,
@@ -50,6 +50,7 @@ template <index_t NumDimG,
           TensorSpecialization B0Spec,
           TensorSpecialization B1Spec,
           TensorSpecialization CSpec,
+          ck::index_t NumPrefetch,
           ck::index_t BlockSize,
           ck::index_t MPerBlock,
           ck::index_t LPerBlock,
@@ -90,7 +91,6 @@ template <index_t NumDimG,
           typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
           MaskingSpecialization MaskingSpec,
-          ck::index_t NumPrefetch = 1,
           ck::LoopScheduler LoopSched = make_default_loop_scheduler(),
           ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
 struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle
@@ -147,7 +147,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle
     static constexpr auto B0EnableLds_auto = MWaves == 1 ? false : true;
     static constexpr auto B1EnableLds_auto = MWaves == 1 ? false : true;
-    static constexpr auto AEnableLds_manu  = true;
+    static constexpr auto AEnableLds_manu  = false;
     static constexpr auto B0EnableLds_manu = true;
     static constexpr auto B1EnableLds_manu = true;
......
@@ -714,13 +714,8 @@ struct GridwiseGemmMultipleD_Wmma
         const auto MBlock = M / MPerBlock;
         const auto NBlock = N / NPerBlock;
         const auto e_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
             e_grid_desc_m_n,
-            make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
-                       make_unmerge_transform(make_tuple(NBlock, Number<NPerBlock>{}))),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
         return e_grid_desc_mblock_mperblock_nblock_nperblock;
     }
......
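The unmerge transform removed above splits the E descriptor's M and N dimensions into block and in-block coordinates. As a plain index-arithmetic sketch (not CK's descriptor machinery), the mapping it expresses is:

#include <array>
#include <cstdint>

// (m, n) -> (MBlock index, offset within the M tile, NBlock index, offset within the N tile)
constexpr std::array<int64_t, 4>
unmerge_mn(int64_t m, int64_t n, int64_t MPerBlock, int64_t NPerBlock)
{
    return {m / MPerBlock, m % MPerBlock, n / NPerBlock, n % NPerBlock};
}

// e.g. with MPerBlock = 128 and NPerBlock = 128, element (300, 70) lives in
// block (2, 0) at in-block offset (44, 70).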
@@ -1407,32 +1407,32 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
             if constexpr(IntraRowSwizzlePerm)
             {
                 temp = __builtin_amdgcn_permlane16(
-                    temp, type_convert<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
-                v_this_row = type_convert<SrcData>(temp);
+                    temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
+                v_this_row = type_convert_sp<SrcData>(temp);
             }

             // apply inter-row permute.
             temp = __builtin_amdgcn_permlanex16(temp,
-                                                type_convert<int>(v_this_row),
+                                                type_convert_sp<int>(v_this_row),
                                                 LowEightRowlaneIdx,
                                                 HighEightRowLaneIdx,
                                                 1,
                                                 0);
-            v_theother_row = type_convert<SrcData>(temp);
+            v_theother_row = type_convert_sp<SrcData>(temp);

             if(get_thread_local_1d_id() % 32 < 16)
             {
                 // apply type convert
-                dst_buf(Number<dst_offset>{}) = type_convert<DstData>(v_this_row);
+                dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_this_row);
                 dst_buf(Number<dst_offset + DstScalarPerVector>{}) =
-                    type_convert<DstData>(v_theother_row);
+                    type_convert_sp<DstData>(v_theother_row);
             }
             else
             {
                 // apply type convert
                 dst_buf(Number<dst_offset + DstScalarPerVector>{}) =
-                    type_convert<DstData>(v_this_row);
-                dst_buf(Number<dst_offset>{}) = type_convert<DstData>(v_theother_row);
+                    type_convert_sp<DstData>(v_this_row);
+                dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_theother_row);
             }
         });
     });
......
@@ -964,8 +964,17 @@ inline __host__ __device__ constexpr float type_convert<float, bhalf_t>(bhalf_t
     return u.fp32;
 }

+// Convert X to Y
+template <typename Y, typename X>
+__host__ __device__ constexpr Y type_convert_sp(X x)
+{
+    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
+
+    return static_cast<Y>(x);
+}
+
 template <>
-inline __host__ __device__ constexpr int type_convert<int, float>(float x)
+inline __host__ __device__ constexpr int type_convert_sp<int, float>(float x)
 {
     union
     {
@@ -977,7 +986,7 @@ inline __host__ __device__ constexpr int type_convert<int, float>(float x)
 }

 template <>
-inline __host__ __device__ constexpr float type_convert<float, int>(int x)
+inline __host__ __device__ constexpr float type_convert_sp<float, int>(int x)
 {
     union
     {
@@ -989,7 +998,7 @@ inline __host__ __device__ constexpr float type_convert<float, int>(int x)
 }

 template <>
-inline __host__ __device__ constexpr int type_convert<int, half_t>(half_t x)
+inline __host__ __device__ constexpr int type_convert_sp<int, half_t>(half_t x)
 {
     union
     {
@@ -1001,7 +1010,7 @@ inline __host__ __device__ constexpr int type_convert<int, half_t>(half_t x)
 }

 template <>
-inline __host__ __device__ constexpr half_t type_convert<half_t, int>(int x)
+inline __host__ __device__ constexpr half_t type_convert_sp<half_t, int>(int x)
 {
     union
     {
......
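A standalone sketch of how the new type_convert_sp is meant to be used in the swizzle path above: the generic template is an ordinary value cast, while the int/float-style specializations reinterpret the bit pattern so values can round-trip through the integer-typed cross-lane intrinsics. This mirrors the hunk above but is simplified and host-only for illustration.

#include <cstdio>

template <typename Y, typename X>
Y type_convert_sp(X x)
{
    return static_cast<Y>(x); // generic fallback: ordinary value conversion
}

template <>
int type_convert_sp<int, float>(float x)
{
    union { float f32; int i32; } u{x};
    return u.i32; // reinterpret the 32-bit pattern, as the specialization in the hunk does
}

template <>
float type_convert_sp<float, int>(int x)
{
    union { int i32; float f32; } u{x};
    return u.f32;
}

int main()
{
    float v      = 1.5f;
    int   lanes  = type_convert_sp<int, float>(v);     // pack for an int-typed permlane intrinsic
    float result = type_convert_sp<float, int>(lanes); // unpack after the cross-lane shuffle
    std::printf("%f -> 0x%08x -> %f\n", v, static_cast<unsigned>(lanes), result);
    return 0;
}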