Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
ca567c60
Commit
ca567c60
authored
Feb 07, 2025
by
Andriy Roshchenko
Browse files
Print A scales to thread mapping
parent
8a50f4dd
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
78 additions
and
12 deletions
+78
-12
test/mx_mfma_op/scale_mapping_32x32x64.cpp
test/mx_mfma_op/scale_mapping_32x32x64.cpp
+78
-12
No files found.
test/mx_mfma_op/scale_mapping_32x32x64.cpp
View file @
ca567c60
...
@@ -12,7 +12,7 @@ __host__ __device__ constexpr Y bit_cast(const X& x)
...
@@ -12,7 +12,7 @@ __host__ __device__ constexpr Y bit_cast(const X& x)
return
__builtin_bit_cast
(
Y
,
x
);
return
__builtin_bit_cast
(
Y
,
x
);
}
}
__global__
void
kernel
()
__global__
void
kernel
_a_scale_mapping
()
{
{
using
dataAB
=
uint8_t
__attribute__
((
ext_vector_type
(
32
)));
using
dataAB
=
uint8_t
__attribute__
((
ext_vector_type
(
32
)));
using
dataC
=
float
__attribute__
((
ext_vector_type
(
16
)));
using
dataC
=
float
__attribute__
((
ext_vector_type
(
16
)));
...
@@ -44,16 +44,18 @@ __global__ void kernel()
...
@@ -44,16 +44,18 @@ __global__ void kernel()
dataX
xb
(
0
);
dataX
xb
(
0
);
#endif
#endif
for
(
int
rowId
=
1
;
rowId
<
2
;
rowId
++
)
// fill first column of B with 1.0
if
(
threadIdx
.
x
==
0
||
threadIdx
.
x
==
32
)
{
{
if
(
threadIdx
.
x
==
0
||
threadIdx
.
x
==
32
)
for
(
size_t
i
=
0
;
i
<
32
;
i
++
)
{
{
for
(
size_t
i
=
0
;
i
<
32
;
i
++
)
regB
[
i
]
=
0x38
;
// 1.0
{
regB
[
i
]
=
0x38
;
// 1.0
}
}
}
}
// verify scale mapping for each row
for
(
int
rowId
=
0
;
rowId
<
32
;
rowId
++
)
{
for
(
int
testId
=
0
;
testId
<
64
;
testId
++
)
for
(
int
testId
=
0
;
testId
<
64
;
testId
++
)
{
{
if
(
threadIdx
.
x
==
0
&&
false
)
if
(
threadIdx
.
x
==
0
&&
false
)
...
@@ -166,12 +168,60 @@ __global__ void kernel()
...
@@ -166,12 +168,60 @@ __global__ void kernel()
printf
(
printf
(
"thread: %u -- xB: %x
\n
"
,
threadIdx
.
x
,
bit_cast
<
int32_t
>
(
xb
[
threadIdx
.
x
/
32
]));
"thread: %u -- xB: %x
\n
"
,
threadIdx
.
x
,
bit_cast
<
int32_t
>
(
xb
[
threadIdx
.
x
/
32
]));
}
}
// Size | BLOCK_N | BLOCK_N |
if
(
threadIdx
.
x
==
0
)
// N | 0 ... 31 | 0 ... 31 |
// Thread Id | 0 ... 31 | 32 ... 63 | Vector
// Register Element ------------ ------------- Element
// Reg0 | M0 | M4 | v[0]
// Reg1 | M1 | M5 | v[1]
// Reg2 | M2 | M6 | v[2]
// Reg3 | M3 | M7 | v[3]
// ____________ _____________
// Reg4 | M8 | M12 | v[4]
// Reg5 | M9 | M13 | v[5]
// Reg6 | M10 | M14 | v[6]
// Reg7 | M11 | M15 | v[7]
// ____________ _____________
// Reg8 | M16 | M20 | v[8]
// Reg9 | M17 | M21 | v[9]
// Reg10 | M18 | M22 | v[10]
// Reg11 | M19 | M23 | v[11]
// ____________ _____________
// Reg12 | M24 | M28 | v[12]
// Reg13 | M25 | M29 | v[13]
// Reg14 | M26 | M30 | v[14]
// Reg15 | M27 | M31 | v[15]
if
(
threadIdx
.
x
==
0
||
threadIdx
.
x
==
32
)
{
{
printf
(
"a(%d,%d) is scaled from thread %f
\n
"
,
rowId
,
testId
,
log2f
(
regC
[
rowId
]));
auto
majChunkId
=
rowId
/
8
;
//{0,1,2,3}
auto
minChunkId
=
rowId
%
8
;
//{0,1,2,3,4,5,6,7}
if
(
minChunkId
<
4
&&
threadIdx
.
x
==
0
)
{
printf
(
"a(%d,%d) is scaled from thread %f
\n
"
,
rowId
,
testId
,
log2f
(
regC
[
4
*
majChunkId
+
minChunkId
]));
// printf("ax(%.0f)*a(%d,%d) ",
// log2f(regC[4 * majChunkId + minChunkId]),
// rowId,
// testId);
}
else
if
(
minChunkId
>=
4
&&
threadIdx
.
x
==
32
)
{
// printf("ax(%.0f)*a(%d,%d) ",
// log2f(regC[4 * majChunkId + minChunkId - 4]),
// rowId,
// testId);
printf
(
"a(%d,%d) is scaled from thread %f
\n
"
,
rowId
,
testId
,
log2f
(
regC
[
4
*
majChunkId
+
minChunkId
-
4
]));
}
}
}
#if
1
#if
0
printf("thread: %u -- regC: %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",
printf("thread: %u -- regC: %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",
threadIdx.x,
threadIdx.x,
regC[0],
regC[0],
...
@@ -192,11 +242,27 @@ __global__ void kernel()
...
@@ -192,11 +242,27 @@ __global__ void kernel()
regC[15]);
regC[15]);
#endif
#endif
}
}
if
(
threadIdx
.
x
==
32
)
{
printf
(
"
\n
"
);
}
}
}
}
}
__global__
void
kernel_b_scale_mapping
()
{
using
dataAB
=
uint8_t
__attribute__
((
ext_vector_type
(
32
)));
using
dataC
=
float
__attribute__
((
ext_vector_type
(
16
)));
using
dataX
=
int32_t
__attribute__
((
ext_vector_type
(
2
)));
dataAB
regA
(
0
);
dataAB
regB
(
0
);
dataC
regC
(
0.0
f
);
}
int
main
()
int
main
()
{
{
kernel
<<<
1
,
64
>>>
();
kernel_a_scale_mapping
<<<
1
,
64
>>>
();
kernel_b_scale_mapping
<<<
1
,
64
>>>
();
return
0
;
return
0
;
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment