Unverified Commit 394dbf83 authored by Haocong WANG's avatar Haocong WANG Committed by GitHub
Browse files

fix layernorm, reduction Ops (#4)



* [Navi3x] Fix Gridwise_multiple_d operation (#649)

* Add CMake Option "USE_OPT_NAVI3X"

* fix bug

* standardize docs (#655)

* Separate bibtex requirement from rocm-docs-core (#656)

* separate bibtex requirement from rocm-docs-core

* point requirements to source rocm-docs-core repo

* Add CMake Option "USE_OPT_NAVI3X" (#647)

* Add CMake Option "USE_OPT_NAVI3X"

* remove navi3x opt compile option from cmake script

* Conv + quantization + tanh  (#645)

* Rename file. Prepare to support another activation

* Add comment for quantization

* Extract out_elementop

* Add tanh example

* Add conv + bias + tanh quantization instance

* Add missing parameter

* Refine cmake

* Add external api and client example

* Extract variable in example

* Fix the comment

---------
Co-authored-by: default avatarzjing14 <zhangjing14@gmail.com>

* Add a denorm test fix (#603)

* Add type_convert implementations for bf16

* Add the fix for conv_fwd

* Add the fix for conv_bwd_data

* Add the fix for conv_bwd_weight

* Format

* Format

* Another format

* Add a macro to use workaround on MI200 only

* Format

---------
Co-authored-by: default avatarRosty Geyyer <rosty.geyyer@amd.com>
Co-authored-by: default avatarzjing14 <zhangjing14@gmail.com>

* simplify karg in device/grid of split-k op (#644)

* simplify karg in device/grid split-k op

* fix mk_kn_mn instances

* add more instances

* use name from tensor layout

* fix 3rd dword of buffer source descriptor (#659)

* add fp64 instances (#658)
Co-authored-by: default avatarroot <root@ctr-ubbsmc15.amd.com>

* Issue #666: Revert "simplify karg in device/grid of split-k op (#644)" (#665)

This reverts commit bb5530af

.

* Groupnorm + swish external api (#668)

* Rename to proper naming

* Add example of groupnorm + swish

* Extract duplicate code in example

* Add groupnorm + swish instances

* Ractor instance generation, split into multiple cpp file

* Add external api and client example

* Refine profiler message

* Use ck math version of exp

* Refine problem size in example

* Add host version of exp

* add a marco to turn on/off denorm fix (off by default) (#673)

* add a marco to turn off denorm fix by default

* expose the marco

---------
Co-authored-by: default avatarroot <root@ctr-ubbsmc15.amd.com>

* fixed quant example (#672)
Co-authored-by: default avatarroot <root@ctr-ubbsmc15.amd.com>

* Add dependabot config and pin rocm-docs-core (#663)

* [gtest] suppress unsafe buffer warn (#670)

ref: https://github.com/ROCmSoftwarePlatform/MIOpen/pull/1912



* Add memory index guard in wmma device ops (#667)

* Add more macros to turn on/off denorm fix (#678)
Co-authored-by: default avatarRosty Geyyer <rosty.geyyer@amd.com>

* Fix a typo (#676)

* Add (#677)

* Allow using ROCm release candidate compilers. (#679)

* enable use of rocm5.5 release candidate 4

* upgrade to ROCM5.5 RC5

* try fix the PUB_KEY error, remove the cmake-data package

* upgrade to latest cmake version

* use private dockerhub repo for rocm5.5 rc5

* add missing bracket

* Disable SkipLDS & Align AIT api

* Update dependabot config (#682)
Co-authored-by: default avatarsamjwu <samjwu@users.noreply.github.com>

* update attn api

* solve type_convert bug + enable

---------
Co-authored-by: default avatarSam Wu <sjwu@ualberta.ca>
Co-authored-by: default avatarSam Wu <sam.wu2@amd.com>
Co-authored-by: default avatarrocking5566 <ChunYu.Lai@amd.com>
Co-authored-by: default avatarzjing14 <zhangjing14@gmail.com>
Co-authored-by: default avatarRostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Co-authored-by: default avatarRosty Geyyer <rosty.geyyer@amd.com>
Co-authored-by: default avatarcarlushuang <carlus.huang@amd.com>
Co-authored-by: default avatarroot <root@ctr-ubbsmc15.amd.com>
Co-authored-by: default avatarJun Liu <Liu.Jun@amd.com>
Co-authored-by: default avatarIllia Silin <98187287+illsilin@users.noreply.github.com>
Co-authored-by: default avatarsamjwu <samjwu@users.noreply.github.com>
Co-authored-by: default avatarhaocwang <Haocong.WANG@amd.com>
parent a0058be6
......@@ -6,7 +6,7 @@
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests
directory: "/docs/.sphinx" # Location of package manifests
open-pull-requests-limit: 10
schedule:
interval: "daily"
......@@ -19,7 +19,7 @@ using IndexDataType = int32_t;
using InLayout = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;
#if 1
#if 0
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
......
......@@ -77,12 +77,12 @@ using DeviceGemmInstance =
ADataType,
B0DataType,
B1DataType,
CDataType,
Acc0BiasDataType,
Acc0DataType,
Acc1BiasDataType,
Acc1DataType,
CShuffleDataType,
CDataType,
AElementOp,
B0ElementOp,
Acc0ElementOp,
......@@ -93,6 +93,7 @@ using DeviceGemmInstance =
TensorSpecB0,
TensorSpecB1,
TensorSpecC,
1,
256,
// Gemm 0
128, // MPerBlock
......
......@@ -70,10 +70,10 @@
// TODO: enable buffer load when found correct 3rd dword
// buffer load
#define CK_USE_AMD_BUFFER_LOAD 0
#define CK_USE_AMD_BUFFER_LOAD 1
// buffer store
#define CK_USE_AMD_BUFFER_STORE 0
#define CK_USE_AMD_BUFFER_STORE 1
// buffer atomic add: integer
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1
......
......@@ -34,12 +34,12 @@ template <index_t NumDimG,
typename ADataType,
typename B0DataType,
typename B1DataType,
typename CDataType,
typename Acc0BiasDataType,
typename Acc0DataType,
typename Acc1BiasDataType,
typename Acc1DataType,
typename CShuffleDataType,
typename CDataType,
typename AElementwiseOperation,
typename B0ElementwiseOperation,
typename AccElementwiseOperation,
......@@ -50,6 +50,7 @@ template <index_t NumDimG,
TensorSpecialization B0Spec,
TensorSpecialization B1Spec,
TensorSpecialization CSpec,
ck::index_t NumPrefetch,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t LPerBlock,
......@@ -90,7 +91,6 @@ template <index_t NumDimG,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
MaskingSpecialization MaskingSpec,
ck::index_t NumPrefetch = 1,
ck::LoopScheduler LoopSched = make_default_loop_scheduler(),
ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle
......@@ -147,7 +147,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle
static constexpr auto B0EnableLds_auto = MWaves == 1 ? false : true;
static constexpr auto B1EnableLds_auto = MWaves == 1 ? false : true;
static constexpr auto AEnableLds_manu = true;
static constexpr auto AEnableLds_manu = false;
static constexpr auto B0EnableLds_manu = true;
static constexpr auto B1EnableLds_manu = true;
......
......@@ -714,13 +714,8 @@ struct GridwiseGemmMultipleD_Wmma
const auto MBlock = M / MPerBlock;
const auto NBlock = N / NPerBlock;
const auto e_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
e_grid_desc_m_n,
make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
make_unmerge_transform(make_tuple(NBlock, Number<NPerBlock>{}))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
return e_grid_desc_mblock_mperblock_nblock_nperblock;
}
......
......@@ -1407,32 +1407,32 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
if constexpr(IntraRowSwizzlePerm)
{
temp = __builtin_amdgcn_permlane16(
temp, type_convert<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
v_this_row = type_convert<SrcData>(temp);
temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
v_this_row = type_convert_sp<SrcData>(temp);
}
// apply inter-row permute.
temp = __builtin_amdgcn_permlanex16(temp,
type_convert<int>(v_this_row),
type_convert_sp<int>(v_this_row),
LowEightRowlaneIdx,
HighEightRowLaneIdx,
1,
0);
v_theother_row = type_convert<SrcData>(temp);
v_theother_row = type_convert_sp<SrcData>(temp);
if(get_thread_local_1d_id() % 32 < 16)
{
// apply type convert
dst_buf(Number<dst_offset>{}) = type_convert<DstData>(v_this_row);
dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_this_row);
dst_buf(Number<dst_offset + DstScalarPerVector>{}) =
type_convert<DstData>(v_theother_row);
type_convert_sp<DstData>(v_theother_row);
}
else
{
// apply type convert
dst_buf(Number<dst_offset + DstScalarPerVector>{}) =
type_convert<DstData>(v_this_row);
dst_buf(Number<dst_offset>{}) = type_convert<DstData>(v_theother_row);
type_convert_sp<DstData>(v_this_row);
dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_theother_row);
}
});
});
......
......@@ -964,8 +964,17 @@ inline __host__ __device__ constexpr float type_convert<float, bhalf_t>(bhalf_t
return u.fp32;
}
// Convert X to Y
template <typename Y, typename X>
__host__ __device__ constexpr Y type_convert_sp(X x)
{
static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
return static_cast<Y>(x);
}
template <>
inline __host__ __device__ constexpr int type_convert<int, float>(float x)
inline __host__ __device__ constexpr int type_convert_sp<int, float>(float x)
{
union
{
......@@ -977,7 +986,7 @@ inline __host__ __device__ constexpr int type_convert<int, float>(float x)
}
template <>
inline __host__ __device__ constexpr float type_convert<float, int>(int x)
inline __host__ __device__ constexpr float type_convert_sp<float, int>(int x)
{
union
{
......@@ -989,7 +998,7 @@ inline __host__ __device__ constexpr float type_convert<float, int>(int x)
}
template <>
inline __host__ __device__ constexpr int type_convert<int, half_t>(half_t x)
inline __host__ __device__ constexpr int type_convert_sp<int, half_t>(half_t x)
{
union
{
......@@ -1001,7 +1010,7 @@ inline __host__ __device__ constexpr int type_convert<int, half_t>(half_t x)
}
template <>
inline __host__ __device__ constexpr half_t type_convert<half_t, int>(int x)
inline __host__ __device__ constexpr half_t type_convert_sp<half_t, int>(int x)
{
union
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment