Project: gaoqiong/composable_kernel_ROCM

Commit f3af1da6, authored Feb 05, 2025 by Andriy Roshchenko

    Merge remote-tracking branch 'internal/andriy/lwpck-2788' into andriy/lwpck-2788

Parents: 2bef5501, 60b885ae
Showing 8 changed files, with 1343 additions and 14 deletions (+1343, -14).
Changed files:

    CMakeLists.txt                                           +1    -1
    CMakePresets.json                                        +189  -0
    include/ck/library/utility/host_tensor_generator.hpp    +15   -0
    include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp    +15   -6
    include/ck/utility/amd_xdlops.hpp                        +26   -2
    test/mx_mfma_op/mx_mfma_op.cpp                           +98   -3
    test/mx_mfma_op/mx_mfma_op.hpp                           +816  -2
    test/mx_mfma_op/scale_mfma_repro.cpp                     +183  -0
CMakeLists.txt

@@ -541,7 +541,7 @@ endif()
 message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
 if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-    add_compile_options(-fcolor-diagnostics)
+    # add_compile_options(-fcolor-diagnostics)
 endif()
 if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9)
     add_compile_options(-fdiagnostics-color=always)
CMakePresets.json (new file, mode 100644)

{
    "version": 3,
    "configurePresets": [
        {
            "name": "linux-debug",
            "displayName": "Linux Debug",
            "hidden": true,
            "generator": "Unix Makefiles",
            "binaryDir": "${sourceDir}/build/${presetName}",
            "installDir": "${sourceDir}/build/install/${presetName}",
            "environment": {
                "MY_ENVIRONMENT_VARIABLE": "NONE",
                "PATH": "/usr/local/.cargo/bin:$penv{PATH}",
                "SCCACHE_IDLE_TIMEOUT": "11000"
            },
            "cacheVariables": {
                "CMAKE_BUILD_TYPE": "Debug",
                "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
                "BUILD_DEV": "ON",
                "CMAKE_CXX_COMPILER": "/opt/rocm/bin/hipcc",
                "CMAKE_PREFIX_PATH": "/opt/rocm",
                "CMAKE_CXX_COMPILER_LAUNCHER": "sccache",
                "CMAKE_C_COMPILER_LAUNCHER": "sccache"
            },
            "condition": {
                "type": "equals",
                "lhs": "${hostSystemName}",
                "rhs": "Linux"
            }
        },
        {
            "name": "MI355-debug",
            "displayName": "MI355 Debug",
            "inherits": "linux-debug",
            "description": "Development Environment for MI355.",
            "cacheVariables": {
                "GPU_TARGETS": "gfx950",
                "CMAKE_BUILD_TYPE": "Debug",
                "CMAKE_CXX_FLAGS": "-O0 -ggdb"
            }
        },
        {
            "name": "MI355-release",
            "displayName": "MI355 Release",
            "inherits": "linux-debug",
            "cacheVariables": {
                "GPU_TARGETS": "gfx950",
                "CMAKE_BUILD_TYPE": "Release",
                "CMAKE_CXX_FLAGS": "-O3"
            }
        },
        {
            "name": "MI300X-release",
            "displayName": "MI300X Release",
            "inherits": "linux-debug",
            "cacheVariables": {
                "GPU_TARGETS": "gfx942",
                "CMAKE_BUILD_TYPE": "Release",
                "CMAKE_CXX_FLAGS": "-O3"
            }
        },
        {
            "name": "MI250-release",
            "displayName": "MI250 Release",
            "inherits": "linux-debug",
            "cacheVariables": {
                "GPU_TARGETS": "gfx90a",
                "CMAKE_BUILD_TYPE": "Release",
                "CMAKE_CXX_FLAGS": "-O3",
                "CK_USE_FP8_ON_UNSUPPORTED_ARCH": "ON"
            }
        },
        {
            "name": "MI250-debug",
            "displayName": "MI250 Debug",
            "inherits": "linux-debug",
            "cacheVariables": {
                "GPU_TARGETS": "gfx90a",
                "CMAKE_BUILD_TYPE": "Debug",
                "CMAKE_CXX_FLAGS": "-O0 -ggdb",
                "CK_USE_FP8_ON_UNSUPPORTED_ARCH": "ON"
            }
        },
        {
            "name": "RX7800-release",
            "displayName": "RX7800 Release",
            "inherits": "linux-debug",
            "cacheVariables": {
                "GPU_TARGETS": "gfx1101",
                "DL_KERNELS": "ON",
                "CMAKE_BUILD_TYPE": "Release",
                "CMAKE_CXX_FLAGS": "-O3"
            }
        },
        {
            "name": "RX7800-debug",
            "displayName": "RX7800 Debug",
            "inherits": "linux-debug",
            "cacheVariables": {
                "GPU_TARGETS": "gfx1101",
                "DL_KERNELS": "ON",
                "CMAKE_BUILD_TYPE": "Debug",
                "CMAKE_CXX_FLAGS": "-O0 -ggdb"
            }
        }
    ],
    "buildPresets": [
        {
            "name": "Debug",
            "hidden": true,
            "configuration": "Debug"
        },
        {
            "name": "Release",
            "hidden": true,
            "configuration": "Release"
        },
        {
            "name": "MI355-debug",
            "displayName": "MI355",
            "configurePreset": "MI355-debug",
            "description": "Build Environment for MI355 Debug.",
            "inherits": ["Debug"],
            "jobs": 128
        },
        {
            "name": "MI355-release",
            "displayName": "MI355",
            "configurePreset": "MI355-release",
            "description": "Build Environment for MI355 Release.",
            "inherits": ["Release"],
            "jobs": 128
        },
        {
            "name": "MI300X-release",
            "displayName": "MI300X",
            "configurePreset": "MI300X-release",
            "description": "Build Environment for MI300X Release.",
            "inherits": ["Release"],
            "jobs": 128
        },
        {
            "name": "MI250-release",
            "displayName": "MI250",
            "configurePreset": "MI250-release",
            "description": "Build Environment for MI250 Release.",
            "inherits": ["Release"],
            "jobs": 128
        },
        {
            "name": "MI250-debug",
            "displayName": "MI250",
            "configurePreset": "MI250-debug",
            "description": "Build Environment for MI250 Debug.",
            "inherits": ["Debug"],
            "jobs": 128
        },
        {
            "name": "RX7800-release",
            "displayName": "RX7800",
            "configurePreset": "RX7800-release",
            "description": "Build Environment for RX7800 Release.",
            "inherits": ["Release"],
            "jobs": 128
        },
        {
            "name": "RX7800-debug",
            "displayName": "RX7800",
            "configurePreset": "RX7800-debug",
            "description": "Build Environment for RX7800 Debug.",
            "inherits": ["Debug"],
            "jobs": 128
        }
    ]
}
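For reference, this is the standard CMake presets workflow rather than anything specific to this commit: with the file at the repository root, a non-hidden configuration can be configured and built entirely from presets, for example

    cmake --preset MI300X-release
    cmake --build --preset MI300X-release

The hidden linux-debug preset only carries the shared defaults (hipcc as CMAKE_CXX_COMPILER, /opt/rocm on the prefix path, sccache as compiler launcher) and is meant to be inherited by the per-GPU presets rather than selected directly.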
include/ck/library/utility/host_tensor_generator.hpp

@@ -359,6 +359,21 @@ struct GeneratorTensor_Sequential
     }
 };
 
+template <ck::index_t Dim>
+struct GeneratorTensor_Sequential<ck::e8m0_bexp_t, Dim>
+{
+    int offset = 0;
+
+    template <typename... Ts>
+    ck::e8m0_bexp_t operator()(Ts... Xs) const
+    {
+        std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
+        int tmp = dims[Dim];
+        return ck::type_convert<ck::e8m0_bexp_t>(powf(2, tmp + offset));
+    }
+};
+
 template <typename T, size_t NumEffectiveDim = 2>
 struct GeneratorTensor_Diagonal
 {
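As a side note on the new specialization, here is a minimal host-only sketch of the value pattern it generates, assuming e8m0_bexp_t follows the usual OCP MX E8M0 convention of storing just a biased exponent with bias 127 (a stored byte of 127 decodes to 1.0, which matches the "127 // 1.0" comments elsewhere in this commit). The program below is illustrative, not CK code: element i along dimension Dim maps to the power of two 2^(i + offset).

    // Standalone illustration (not from the diff): the float value produced by
    // powf(2, tmp + offset) and the byte an E8M0 scale would store for it.
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const int offset = 0;
        for(int i = 0; i < 4; ++i)
        {
            const float value = std::pow(2.0f, static_cast<float>(i + offset));
            // assumed E8M0 encoding: biased exponent, bias 127
            const uint8_t e8m0_byte = static_cast<uint8_t>(127 + i + offset);
            std::printf("i=%d  value=%4.1f  e8m0 byte=0x%02x\n", i, value, e8m0_byte);
        }
        return 0;
    }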
include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp

@@ -780,7 +780,6 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8f8>
     }
 };
 
-// TODO: fix mfma...f8f6f4 instructions
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x64f8f6f4>
 {

@@ -847,9 +846,14 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_32x32x64f8f6f4>
     // clang-format on
 
     template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
-    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    __device__ void run(const FloatA& a,
+                        const int32_t& scale_a,
+                        const FloatB& b,
+                        const int32_t& scale_b,
+                        FloatC& reg_c) const
     {
-        intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+        intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(
+            a, scale_a, b, scale_b, reg_c);
     }
 };

@@ -871,9 +875,14 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>
     // clang-format on
 
     template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
-    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
-    {
-        intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
-    }
+    __device__ void run(const FloatA& a,
+                        const int32_t& scale_a,
+                        const FloatB& b,
+                        const int32_t& scale_b,
+                        FloatC& reg_c) const
+    {
+        intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(
+            a, scale_a, b, scale_b, reg_c);
+    }
 };
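Note on the two hunks above: the unscaled run(a, b, reg_c) overload is replaced rather than kept alongside the new one, so warp-level callers of these two mfma_type specializations must now supply one 32-bit scale word per operand, which run() forwards unchanged to the corresponding intrin_mfma_scale_*::Run. Going by the "127 // 1.0" comments in the repro added later in this commit, each scale word carries an E8M0 biased exponent; that reading is an inference from those comments, not something stated in this hunk.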
include/ck/utility/amd_xdlops.hpp

@@ -519,12 +519,36 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32>
 {
     template <class FloatC>
     __device__ static void Run(const f8x32_t& reg_a,
-                               const int32_t scale_a,
+                               const int32_t& scale_a,
                                const f8x32_t& reg_b,
-                               const int32_t scale_b,
+                               const int32_t& scale_b,
                                FloatC& reg_c)
     {
 #if defined(__gfx950__)
+        if(threadIdx.x == 0 || threadIdx.x == 32)
+        {
+            printf("thread: %u -- xA: %x\n", threadIdx.x, static_cast<uint32_t>(scale_a));
+            printf("thread: %u -- xB: %x\n", threadIdx.x, static_cast<uint32_t>(scale_b));
+            // printf("intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32> thread: %u -- scale_a: %f\n",
+            //        threadIdx.x,
+            //        static_cast<float>(ck::e8m0_bexp_t(scale_a)));
+            // printf("intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32> thread: %u -- scale_b: %f\n",
+            //        threadIdx.x,
+            //        static_cast<float>(ck::e8m0_bexp_t(scale_b)));
+            // for(size_t i = 0; i < 32; i++)
+            // {
+            //     printf("thread: %u -- reg_a[%zu]: %f\n",
+            //            threadIdx.x,
+            //            i,
+            //            type_convert<float>(f8_t{static_cast<f8x32_t::data_v>(reg_a)[i]}));
+            //     // printf("thread: %u -- reg_a[%zu]: %f\n",
+            //     //        threadIdx.x,
+            //     //        i,
+            //     //        type_convert<float>(f8_t{static_cast<f8x32_t::data_v>(reg_b)[i]}));
+            // }
+        }
+        // https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10
         reg_c.template AsType<float16_t>()(Number<0>{}) = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
test/mx_mfma_op/mx_mfma_op.cpp

@@ -30,11 +30,11 @@ bool run_mfma_test(ck::index_t init)
     constexpr auto BLOCK_N = mfma_instr.n_per_blk;
     constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk;
 
-    const auto mx_mfma_kernel = ck::matmul<AType, BType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K>;
+    const auto mfma_kernel = ck::matmul<AType, BType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K>;
 
     bool pass = true;
 
-    pass = ck::mfma_test::TestMFMA<decltype(mx_mfma_kernel),
+    pass = ck::mfma_test::TestMFMA<decltype(mfma_kernel),
                                    AType,
                                    BType,
                                    CType,

@@ -45,7 +45,7 @@ bool run_mfma_test(ck::index_t init)
                                    CLayout,
                                    BLOCK_M,
                                    BLOCK_N,
-                                   BLOCK_K>{}(mx_mfma_kernel, init);
+                                   BLOCK_K>{}(mfma_kernel, init);
 
     return pass;
 }

@@ -63,3 +63,98 @@ TEST(MFMA, FP8MFMA32x32x64)
     auto pass = run_mfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::F32_32x32x64>(AB_init);
     EXPECT_TRUE(pass);
 }
+
+/**
+ * @brief Run the test for the given MX MFMA instruction
+ *
+ * @param init - selects initialization algorithm for A and B tensors
+ */
+template <typename AType, typename BType, typename CType, ck::MFMA_F8F6F4 mfma>
+bool run_mxmfma_test(ck::index_t init)
+{
+    static_assert(mfma == ck::MFMA_F8F6F4::SCALE_F32_16x16x128 ||
+                      mfma == ck::MFMA_F8F6F4::SCALE_F32_32x32x64,
+                  "Only SCALE_F32_16x16x128 and SCALE_F32_32x32x64 are supported");
+
+    using ALayout = ck::tensor_layout::gemm::RowMajor;
+    using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+    using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+    using AccType = float;
+    // only MFMA_F32 instructions supported
+    // using CPUAccType = AccType;
+
+    using ScaleType = ck::e8m0_bexp_t; // biased exponent type
+
+    ck::mfma_type<static_cast<ck::MfmaInstr>(mfma)> mfma_instr;
+
+    constexpr auto BLOCK_M = mfma_instr.m_per_blk;
+    constexpr auto BLOCK_N = mfma_instr.n_per_blk;
+    constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk;
+    constexpr auto BLOCK_X = 32; // scaling vector size
+
+    const auto mx_mfma_kernel =
+        ck::matmul<AType, BType, ScaleType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K, BLOCK_X>;
+
+    bool pass = true;
+
+    pass = ck::mxmfma_test::TestMXMFMA<decltype(mx_mfma_kernel),
+                                       AType, BType, ScaleType, CType,
+                                       ALayout, BLayout, CLayout,
+                                       BLOCK_M, BLOCK_N, BLOCK_K, BLOCK_X>{}(mx_mfma_kernel, init);
+
+    return pass;
+}
+
+TEST(MXMFMA, MXFP8MFMA16x16x128i2)
+{
+    auto AB_init = 2;
+    auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_16x16x128>(AB_init);
+    EXPECT_TRUE(pass);
+}
+
+TEST(MXMFMA, MXFP8MFMA32x32x64i2)
+{
+    auto AB_init = 2;
+    auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
+    EXPECT_TRUE(pass);
+}
+
+TEST(MXMFMA, MXFP8MFMA16x16x128i3)
+{
+    auto AB_init = 3;
+    auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_16x16x128>(AB_init);
+    EXPECT_TRUE(pass);
+}
+
+TEST(MXMFMA, MXFP8MFMA32x32x64i3)
+{
+    auto AB_init = 3;
+    auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
+    EXPECT_TRUE(pass);
+}
+
+TEST(MXMFMA, MXFP8MFMA16x16x128i4)
+{
+    auto AB_init = 4;
+    auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_16x16x128>(AB_init);
+    EXPECT_TRUE(pass);
+}
+
+TEST(MXMFMA, MXFP8MFMA32x32x64i4)
+{
+    auto AB_init = 4;
+    auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
+    EXPECT_TRUE(pass);
+}
+
+TEST(MXMFMA, MXFP8MFMA32x32x64i5)
+{
+    auto AB_init = 5;
+    auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
+    EXPECT_TRUE(pass);
+}
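When the test binary built from this directory is run (its exact name depends on the project's CMake targets, so treat the name as an assumption), the new cases can be selected with GoogleTest's standard filtering, e.g. --gtest_filter=MXMFMA.* for the seven scaled tests added above, or --gtest_filter=MFMA.* for the original unscaled ones.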
test/mx_mfma_op/mx_mfma_op.hpp

(Diff collapsed in the original view: 816 additions, 2 deletions; contents not shown.)
test/mx_mfma_op/scale_mfma_repro.cpp (new file, mode 100644)

#include <hip/hip_ext.h>
#include <hip/hip_runtime.h>

__global__ void kernel()
{
    using dataAB = uint8_t __attribute__((ext_vector_type(32)));
    using dataC  = float __attribute__((ext_vector_type(16)));
    using dataX  = int32_t __attribute__((ext_vector_type(2)));

    dataAB regA(0x38);
    dataAB regB(0x38);
    dataC regC(1.0f);
    // dataC regCin(1.0f);

#if 1
    // dataX xa{127, 127}; // 1.0
    dataX xa(127 & 0xFF); // 1.0
    dataX xb(127 & 0xFF); // 1.0
#else
    dataX xa(0);
    dataX xb(0);
#endif

#if 0
    if(threadIdx.x == 0)
    {
        // xa = 127; // 1.0
        for(size_t i = 0; i < 32; i++)
        {
            regA[i] = 0x38; // 1.0
        }
        for(size_t i = 0; i < 32; i++)
        {
            regB[i] = 0x38; // 1.0
        }
        printf("thread: %u -- xA: %x\n", threadIdx.x, xa[threadIdx.x / 32]);
        printf("thread: %u -- xB: %x\n", threadIdx.x, xb[threadIdx.x / 32]);
    }
    if(threadIdx.x == 32)
    {
        // xa = 126; // 0.5
        for(size_t i = 0; i < 32; i++)
        {
            regA[i] = 0xC0; // -2.0
        }
        for(size_t i = 0; i < 32; i++)
        {
            regB[i] = 0x38; // 1.0
        }
        printf("thread: %u -- xA: %x\n", threadIdx.x, xa[threadIdx.x / 32]);
        printf("thread: %u -- xB: %x\n", threadIdx.x, xb[threadIdx.x / 32]);
    }
#endif

    __syncthreads();

    printf("thread: %u -- regA: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x "
           "%x %x %x %x %x %x %x %x %x %x\n",
           threadIdx.x,
           regA[0], regA[1], regA[2], regA[3], regA[4], regA[5], regA[6], regA[7],
           regA[8], regA[9], regA[10], regA[11], regA[12], regA[13], regA[14], regA[15],
           regA[16], regA[17], regA[18], regA[19], regA[20], regA[21], regA[22], regA[23],
           regA[24], regA[25], regA[26], regA[27], regA[28], regA[29], regA[30], regA[31]);

    printf("thread: %u -- regB: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x "
           "%x %x %x %x %x %x %x %x %x %x\n",
           threadIdx.x,
           regB[0], regB[1], regB[2], regB[3], regB[4], regB[5], regB[6], regB[7],
           regB[8], regB[9], regB[10], regB[11], regB[12], regB[13], regB[14], regB[15],
           regB[16], regB[17], regB[18], regB[19], regB[20], regB[21], regB[22], regB[23],
           regB[24], regB[25], regB[26], regB[27], regB[28], regB[29], regB[30], regB[31]);

    //__builtin_amdgcn_mfma_ld_scale_b32(xb[threadIdx.x / 32], 0, 0);
    regC = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(regA,
                                                           regB,
                                                           regC,
                                                           0, // cbsz
                                                           0, // blgp
                                                           0,
                                                           xa[threadIdx.x / 32],
                                                           0,
                                                           xb[threadIdx.x / 32]);

    __syncthreads();

    printf("thread: %u -- regC: %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",
           threadIdx.x,
           regC[0], regC[1], regC[2], regC[3], regC[4], regC[5], regC[6], regC[7],
           regC[8], regC[9], regC[10], regC[11], regC[12], regC[13], regC[14], regC[15]);

    // printf("thread: %u -- regCin: %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",
    //        threadIdx.x,
    //        regCin[0], regCin[1], regCin[2], regCin[3], regCin[4], regCin[5], regCin[6],
    //        regCin[7], regCin[8], regCin[9], regCin[10], regCin[11], regCin[12], regCin[13],
    //        regCin[14], regCin[15]);
}

int main()
{
    kernel<<<1, 64>>>();
    return 0;
}
\ No newline at end of file
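A note on building and sanity-checking the repro; the exact command line is an assumption about the environment, not something recorded in the commit. Since the scaled f8f6f4 builtin is gated on __gfx950__, a plain compile along the lines of

    hipcc --offload-arch=gfx950 test/mx_mfma_op/scale_mfma_repro.cpp -o scale_mfma_repro

should work on a ROCm installation that supports that target. If the instruction behaves as an ordinary D = A*B + C MFMA once the scales are applied, then with every A and B element at 1.0 (0x38 in FP8 E4M3), the accumulator preloaded with 1.0, and both scale words at 127 (E8M0 for 1.0), each of the 16 per-thread outputs should print as 64 * 1.0 * 1.0 + 1.0 = 65.0.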