Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
78fbf32f
Commit
78fbf32f
authored
Jan 14, 2025
by
coderfeli
Browse files
run ok now
parent
747dd16c
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
70 additions
and
102 deletions
+70
-102
example/ck_tile/15_fused_moe/main.cpp
example/ck_tile/15_fused_moe/main.cpp
+14
-37
include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x256_1x4x1_16x16x32_itl.hpp
.../flatmm/block/flatmm_sn_32x128x256_1x4x1_16x16x32_itl.hpp
+41
-55
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x256_1x4x1_16x16x16_itl.inc
...ck/uk/flatmm_sn_uk_gfx9_32x128x256_1x4x1_16x16x16_itl.inc
+10
-5
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
...ck/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
+4
-4
script/cmake-ck-dev.sh
script/cmake-ck-dev.sh
+1
-1
No files found.
example/ck_tile/15_fused_moe/main.cpp
View file @
78fbf32f
...
...
@@ -11,8 +11,8 @@
template
<
typename
DataType
>
auto
get_elimit
()
{
double
rtol
=
2
e-
2
;
double
atol
=
2
e-
2
;
double
rtol
=
1
e-
1
;
double
atol
=
1
e-
1
;
return
ck_tile
::
make_tuple
(
rtol
,
atol
);
}
...
...
@@ -309,24 +309,6 @@ bool run(const ck_tile::ArgParser& arg_parser)
}
else
if
(
init
==
3
)
{
// ck_tile::FillConstant<ADataType>{}(a_host);
// ck_tile::FillStepRange<ADataType>{0.f, 16384.f, 1.f}(a_host);
// for (int i = 0 ; i < tokens; i++){
// for (int j = 0; j < hidden_size; j++) {
// a_host.mData[i * hidden_size + j] = ck_tile::type_convert<ADataType>(float(i+1) * 0.1 + float(i * j % 116) * 0.0012);
// }
// }
ck_tile
::
FillUniformDistribution
<
ADataType
>
{
0.
f
,
1.
f
,
seed
,
true
}(
a_host
);
ck_tile
::
FillUniformDistribution
<
GDataType
>
{
0.
f
,
1.
f
,
seed
,
true
}(
g_host
);
ck_tile
::
FillUniformDistribution
<
DDataType
>
{
0.
f
,
1.
f
,
seed
,
true
}(
d_host
);
ck_tile
::
FillUniformDistribution
<
AScaleDataType
>
{
-
.5
f
,
.5
f
,
seed
,
true
}(
sa_host
);
ck_tile
::
FillUniformDistribution
<
GScaleDataType
>
{
-
.5
f
,
.5
f
,
seed
,
true
}(
sg_host
);
ck_tile
::
FillUniformDistribution
<
DScaleDataType
>
{
-
.5
f
,
.5
f
,
seed
,
true
}(
sd_host
);
ck_tile
::
FillUniformDistribution
<
YSmoothScaleDataType
>
{
-
.5
f
,
.5
f
,
seed
,
true
}(
sy_host
);
ck_tile
::
FillUniformDistribution
<
TopkWeightDataType
>
{
-
.5
f
,
.5
f
,
seed
,
true
}(
topk_weight_host
);
// a_host.savetxt("a.txt");
// fill((ADataType *)a_host.mData.data(), a_host.size(), ck_tile::type_convert<ADataType>(0.1f));
// fill((GDataType *)g_host.mData.data(), g_host.size(), ck_tile::type_convert<GDataType>(0.1f));
// fill((DDataType *)d_host.mData.data(), d_host.size(), ck_tile::type_convert<DDataType>(0.1f));
...
...
@@ -336,19 +318,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
// fill((DScaleDataType *)sd_host.mData.data(), sd_host.size(), ck_tile::type_convert<DScaleDataType>(1.f));
// fill((YSmoothScaleDataType *)sy_host.mData.data(), sy_host.size(), ck_tile::type_convert<YSmoothScaleDataType>(1.f));
// fill((TopkWeightDataType *)topk_weight_host.mData.data(), topk_weight_host.size(), ck_tile::type_convert<TopkWeightDataType>(1.f));
// ck_tile::FillNormalDistribution<ADataType>{.1f, .1f, seed, true}(a_host);
// ck_tile::FillNormalDistribution<GDataType>{.1f, .1f, seed, true}(g_host);
// ck_tile::FillNormalDistribution<DDataType>{.1f, .1f, seed, true}(d_host);
// ck_tile::FillNormalDistribution<AScaleDataType>{1.f, 1.f, seed, true}(sa_host);
// ck_tile::FillNormalDistribution<GScaleDataType>{1.f, 1.f, seed, true}(sg_host);
// ck_tile::FillNormalDistribution<DScaleDataType>{1.f, 1.f, seed, true}(sd_host);
// ck_tile::FillNormalDistribution<YSmoothScaleDataType>{1.f, 1.f, seed, true}(sy_host);
// ck_tile::FillNormalDistribution<TopkWeightDataType>{1.f, 1.f, seed, true}(topk_weight_host);
// ck_tile::FillNormalDistribution<DDataType>{0.f, 1.f, seed, true}(d_host);
// ck_tile::FillNormalDistribution<DScaleDataType>{0.f, 1.f, seed, true}(sd_host);
// ck_tile::FillNormalDistribution<YSmoothScaleDataType>{0.f, 1.f, seed, true}(sy_host);
// ck_tile::FillNormalDistribution<TopkWeightDataType>{0.f, 1.f, seed, true}(topk_weight_host);
ck_tile
::
FillNormalDistribution
<
ADataType
>
{
0.
f
,
.1
f
,
seed
,
true
}(
a_host
);
ck_tile
::
FillNormalDistribution
<
GDataType
>
{
0.
f
,
.1
f
,
seed
,
true
}(
g_host
);
ck_tile
::
FillNormalDistribution
<
DDataType
>
{
0.
f
,
.1
f
,
seed
,
true
}(
d_host
);
ck_tile
::
FillNormalDistribution
<
AScaleDataType
>
{
0.
f
,
1.
f
,
seed
,
true
}(
sa_host
);
ck_tile
::
FillNormalDistribution
<
GScaleDataType
>
{
0.
f
,
1.
f
,
seed
,
true
}(
sg_host
);
ck_tile
::
FillNormalDistribution
<
DScaleDataType
>
{
0.
f
,
1.
f
,
seed
,
true
}(
sd_host
);
ck_tile
::
FillNormalDistribution
<
YSmoothScaleDataType
>
{
0.
f
,
1.
f
,
seed
,
true
}(
sy_host
);
ck_tile
::
FillNormalDistribution
<
TopkWeightDataType
>
{
0.
f
,
1.
f
,
seed
,
true
}(
topk_weight_host
);
}
// permute weight
...
...
@@ -369,10 +346,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
}
else
{
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
topk_ids_host
.
mData
.
size
());
i
++
)
{
topk_ids_host
.
mData
[
i
]
=
0
;
}
//
topid_unique_gen<IndexDataType>(topk_ids_host.mData, tokens, topk, experts, 11913);
//
for(int i = 0; i < static_cast<int>(topk_ids_host.mData.size()); i++) {
//
topk_ids_host.mData[i] = 0;
//
}
topid_unique_gen
<
IndexDataType
>
(
topk_ids_host
.
mData
,
tokens
,
topk
,
experts
,
11913
);
}
// leave it here for future debug purpose
...
...
include/ck_tile/ops/flatmm/block/flatmm_sn_32x128x256_1x4x1_16x16x32_itl.hpp
View file @
78fbf32f
...
...
@@ -85,14 +85,6 @@ struct FlatmmSn_32x128x256_1x4x1_16x16x32_BF16_itl : public FlatmmSn_32x128x256_
register
float
v_c29
asm
(
"v93"
);
register
float
v_c30
asm
(
"v94"
);
register
float
v_c31
asm
(
"v95"
);
register
bf16x2_t
v_debug
asm
(
"v160"
);
register
bf16x2_t
v_debug1
asm
(
"v161"
);
register
bf16x2_t
v_debug2
asm
(
"v162"
);
register
bf16x2_t
v_debug3
asm
(
"v163"
);
register
bf16x2_t
v_debug4
asm
(
"v164"
);
register
bf16x2_t
v_debug5
asm
(
"v165"
);
register
bf16x2_t
v_debug6
asm
(
"v166"
);
register
bf16x2_t
v_debug7
asm
(
"v167"
);
int32_t
nan_hi
=
0x7fff0000
;
int32_t
nan_lo
=
0x00007fff
;
...
...
@@ -119,7 +111,6 @@ struct FlatmmSn_32x128x256_1x4x1_16x16x32_BF16_itl : public FlatmmSn_32x128x256_
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int
sfl_sld
=
(
lane_id
%
2
)
*
2
+
(
lane_id
/
2
)
*
(
64
+
4
)
+
(
threadIdx
.
x
/
64
)
*
4
;
sfl_sld
*=
2
;
// B nr->kr
// clang-format off
#pragma clang diagnostic push
...
...
@@ -162,15 +153,7 @@ struct FlatmmSn_32x128x256_1x4x1_16x16x32_BF16_itl : public FlatmmSn_32x128x256_
[
c28
]
"+v"
(
v_c28
),
[
c29
]
"+v"
(
v_c29
),
[
c30
]
"+v"
(
v_c30
),
[
c31
]
"+v"
(
v_c31
),
[
debug0
]
"+v"
(
v_debug
),
[
debug1
]
"+v"
(
v_debug1
),
[
debug2
]
"+v"
(
v_debug2
),
[
debug3
]
"+v"
(
v_debug3
),
[
debug4
]
"+v"
(
v_debug4
),
[
debug5
]
"+v"
(
v_debug5
),
[
debug6
]
"+v"
(
v_debug6
),
[
debug7
]
"+v"
(
v_debug7
)
[
c31
]
"+v"
(
v_c31
)
:
[
sld_a_base
]
"n"
(
0
),
[
shfl_base
]
"n"
(
0
),
...
...
@@ -197,10 +180,6 @@ struct FlatmmSn_32x128x256_1x4x1_16x16x32_BF16_itl : public FlatmmSn_32x128x256_
[
v_os_b1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
1
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
2
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
3
>
{}]
*
sizeof
(
BDataType
))),
// [v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
// [v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
// [v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
// [v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
[
s_tile_os_o
]
"s"
(
tile_stride_o_bytes
),
[
s_tile_os_b
]
"s"
(
tile_stride_b_bytes
),
[
scale_0
]
"v"
(
s0
),
...
...
@@ -216,7 +195,7 @@ struct FlatmmSn_32x128x256_1x4x1_16x16x32_BF16_itl : public FlatmmSn_32x128x256_
[
s_execflag_6
]
"s"
(
o_flags
[
number
<
6
>
{}]),
[
s_execflag_7
]
"s"
(
o_flags
[
number
<
7
>
{}])
:
"memory"
,
"exec"
,
"m0"
,
"vcc"
,
"scc"
,
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"a6"
,
"a7"
,
"a8"
,
"a9"
,
"memory"
,
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"a6"
,
"a7"
,
"a8"
,
"a9"
,
"a10"
,
"a11"
,
"a12"
,
"a13"
,
"a14"
,
"a15"
,
"a16"
,
"a17"
,
"a18"
,
"a19"
,
"a20"
,
"a21"
,
"a22"
,
"a23"
,
"a24"
,
"a25"
,
"a26"
,
"a27"
,
"a28"
,
"a29"
,
"a30"
,
"a31"
,
"a32"
,
"a33"
,
"a34"
,
"a35"
,
"a36"
,
"a37"
,
"a38"
,
"a39"
,
...
...
@@ -247,7 +226,7 @@ struct FlatmmSn_32x128x256_1x4x1_16x16x32_BF16_itl : public FlatmmSn_32x128x256_
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"s8"
,
"s9"
,
"s12"
,
"s13"
,
"s14"
,
"s15"
,
"s38"
,
"s39"
,
"s52"
,
"s86"
,
"s36"
,
"s37"
,
"s59"
,
"s80"
,
"s36"
,
"s37"
,
"s59"
,
"s80"
,
"s56"
,
"s60"
,
"v10"
,
"v11"
,
"v12"
,
"v13"
,
"v14"
,
"v15"
,
"v16"
,
"v17"
,
"v50"
,
"v54"
,
"v55"
,
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
...
...
@@ -274,14 +253,6 @@ struct FlatmmSn_32x128x256_1x4x1_16x16x32_BF16_itl : public FlatmmSn_32x128x256_
);
#pragma clang diagnostic pop
// clang-format on
// if(threadIdx.x==0) {
// printf("%d\n", threadIdx.x);
// }
if
(
threadIdx
.
x
==
0
)
{
printf
(
"%d
\n
"
,
threadIdx
.
x
);
}
// }
// __syncthreads();
}
};
...
...
@@ -355,14 +326,22 @@ struct FlatmmSn_32x128x256_1x4x1_16x16x32_FP16_itl : public FlatmmSn_32x128x256_
register
float
v_c29
asm
(
"v93"
);
register
float
v_c30
asm
(
"v94"
);
register
float
v_c31
asm
(
"v95"
);
register
fp16x2_t
v_debug
asm
(
"v160"
);
register
fp16x2_t
v_debug1
asm
(
"v161"
);
register
fp16x2_t
v_debug2
asm
(
"v162"
);
register
fp16x2_t
v_debug3
asm
(
"v163"
);
register
fp16x2_t
v_debug4
asm
(
"v164"
);
register
fp16x2_t
v_debug5
asm
(
"v165"
);
register
fp16x2_t
v_debug6
asm
(
"v166"
);
register
fp16x2_t
v_debug7
asm
(
"v167"
);
// register fp16x2_t v_debug0 asm("v160");
// register fp16x2_t v_debug1 asm("v161");
// register fp16x2_t v_debug2 asm("v162");
// register fp16x2_t v_debug3 asm("v163");
// register fp16x2_t v_debug4 asm("v164");
// register fp16x2_t v_debug5 asm("v165");
// register fp16x2_t v_debug6 asm("v166");
// register fp16x2_t v_debug7 asm("v167");
// register fp16x2_t v_debug8 asm("v168");
// register fp16x2_t v_debug9 asm("v169");
// register fp16x2_t v_debug10 asm("v170");
// register fp16x2_t v_debug11 asm("v171");
// register fp16x2_t v_debug12 asm("v172");
// register fp16x2_t v_debug13 asm("v173");
// register fp16x2_t v_debug14 asm("v174");
// register fp16x2_t v_debug15 asm("v175");
int32_t
nan_hi
=
0x7fff0000
;
int32_t
nan_lo
=
0x00007fff
;
...
...
@@ -431,15 +410,24 @@ struct FlatmmSn_32x128x256_1x4x1_16x16x32_FP16_itl : public FlatmmSn_32x128x256_
[
c28
]
"+v"
(
v_c28
),
[
c29
]
"+v"
(
v_c29
),
[
c30
]
"+v"
(
v_c30
),
[
c31
]
"+v"
(
v_c31
),
[
debug0
]
"+v"
(
v_debug
),
[
debug1
]
"+v"
(
v_debug1
),
[
debug2
]
"+v"
(
v_debug2
),
[
debug3
]
"+v"
(
v_debug3
),
[
debug4
]
"+v"
(
v_debug4
),
[
debug5
]
"+v"
(
v_debug5
),
[
debug6
]
"+v"
(
v_debug6
),
[
debug7
]
"+v"
(
v_debug7
)
[
c31
]
"+v"
(
v_c31
)
// ,
// [debug0]"+v"(v_debug0),
// [debug1]"+v"(v_debug1),
// [debug2]"+v"(v_debug2),
// [debug3]"+v"(v_debug3),
// [debug4]"+v"(v_debug4),
// [debug5]"+v"(v_debug5),
// [debug6]"+v"(v_debug6),
// [debug7]"+v"(v_debug7),
// [debug8 ]"+v"(v_debug8 ),
// [debug9 ]"+v"(v_debug9 ),
// [debug10]"+v"(v_debug10),
// [debug11]"+v"(v_debug11),
// [debug12]"+v"(v_debug12),
// [debug13]"+v"(v_debug13),
// [debug14]"+v"(v_debug14),
// [debug15]"+v"(v_debug15)
:
[
sld_a_base
]
"n"
(
0
),
[
shfl_base
]
"n"
(
0
),
...
...
@@ -486,7 +474,8 @@ struct FlatmmSn_32x128x256_1x4x1_16x16x32_FP16_itl : public FlatmmSn_32x128x256_
[
s_execflag_6
]
"s"
(
o_flags
[
number
<
6
>
{}]),
[
s_execflag_7
]
"s"
(
o_flags
[
number
<
7
>
{}])
:
"memory"
,
"exec"
,
"m0"
,
"vcc"
,
"scc"
,
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"a6"
,
"a7"
,
"a8"
,
"a9"
,
"memory"
,
"exec"
,
"m0"
,
"vcc"
,
"scc"
,
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"a6"
,
"a7"
,
"a8"
,
"a9"
,
"a10"
,
"a11"
,
"a12"
,
"a13"
,
"a14"
,
"a15"
,
"a16"
,
"a17"
,
"a18"
,
"a19"
,
"a20"
,
"a21"
,
"a22"
,
"a23"
,
"a24"
,
"a25"
,
"a26"
,
"a27"
,
"a28"
,
"a29"
,
"a30"
,
"a31"
,
"a32"
,
"a33"
,
"a34"
,
"a35"
,
"a36"
,
"a37"
,
"a38"
,
"a39"
,
...
...
@@ -517,7 +506,7 @@ struct FlatmmSn_32x128x256_1x4x1_16x16x32_FP16_itl : public FlatmmSn_32x128x256_
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"s8"
,
"s9"
,
"s12"
,
"s13"
,
"s14"
,
"s15"
,
"s38"
,
"s39"
,
"s52"
,
"s86"
,
"s36"
,
"s37"
,
"s59"
,
"s80"
,
"s36"
,
"s37"
,
"s59"
,
"s80"
,
"s56"
,
"s60"
,
"v10"
,
"v11"
,
"v12"
,
"v13"
,
"v14"
,
"v15"
,
"v16"
,
"v17"
,
"v50"
,
"v54"
,
"v55"
,
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
...
...
@@ -544,9 +533,6 @@ struct FlatmmSn_32x128x256_1x4x1_16x16x32_FP16_itl : public FlatmmSn_32x128x256_
);
#pragma clang diagnostic pop
// clang-format on
if
(
threadIdx
.
x
==
0
)
{
printf
(
"%d
\n
"
,
threadIdx
.
x
);
}
}
};
...
...
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x256_1x4x1_16x16x16_itl.inc
View file @
78fbf32f
...
...
@@ -126,7 +126,8 @@
" ds_read_b32 v13, %[v_sfl_sld] offset:16736
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], v[64:67]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872
\n
"
_UK_MFMA_
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], v[64:67]
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:20992
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:21024
\n
"
_UK_MFMA_
...
...
@@ -226,6 +227,7 @@
" s_mov_b64 exec, s[38:39]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_add_u32 s60, 0x00000100, s80
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
" s_cselect_b32 s56, %[s_tile_os_b], 0
\n
"
...
...
@@ -275,16 +277,16 @@
" [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376
\n
"
" ds_write_b64
v3
, v[64:65] offset:16640
\n
"
_UK_MFMA_
" ds_write_b64
%[v_sfl_sst]
, v[64:65] offset:16640
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[130:131], v[130:131], v[80:83]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" ds_write_b64
v3
, v[66:67] offset:20992
\n
"
_UK_MFMA_
" ds_write_b64
%[v_sfl_sst]
, v[66:67] offset:20992
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[132:133], v[132:133], v[80:83]
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440
\n
"
" ds_write_b64
v3
, v[68:69] offset:18816
\n
"
_UK_MFMA_
" ds_write_b64
%[v_sfl_sst]
, v[68:69] offset:18816
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[134:135], v[134:135], v[80:83]
\n
"
" ds_write_b64
v3
, v[70:71] offset:23168
\n
"
_UK_MFMA_
" ds_write_b64
%[v_sfl_sst]
, v[70:71] offset:23168
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[136:137], v[136:137], v[80:83]
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728
\n
"
_UK_MFMA_
...
...
@@ -419,6 +421,7 @@
" s_mov_b64 exec, s[38:39]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_add_u32 s60, 0x00000100, s80
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
...
...
@@ -517,12 +520,14 @@
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
#undef _UK_MFMA_
...
...
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
View file @
78fbf32f
...
...
@@ -402,16 +402,16 @@
" [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376
\n
"
" ds_write_b64
v3
, v[64:65] offset:16640
\n
"
_UK_MFMA_
" ds_write_b64
%[v_sfl_sst]
, v[64:65] offset:16640
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[130:131], v[130:131], v[80:83]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" ds_write_b64
v3
, v[66:67] offset:20992
\n
"
_UK_MFMA_
" ds_write_b64
%[v_sfl_sst]
, v[66:67] offset:20992
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[132:133], v[132:133], v[80:83]
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440
\n
"
" ds_write_b64
v3
, v[68:69] offset:18816
\n
"
_UK_MFMA_
" ds_write_b64
%[v_sfl_sst]
, v[68:69] offset:18816
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[134:135], v[134:135], v[80:83]
\n
"
" ds_write_b64
v3
, v[70:71] offset:23168
\n
"
_UK_MFMA_
" ds_write_b64
%[v_sfl_sst]
, v[70:71] offset:23168
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[136:137], v[136:137], v[80:83]
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728
\n
"
_UK_MFMA_
...
...
script/cmake-ck-dev.sh
View file @
78fbf32f
...
...
@@ -17,7 +17,7 @@ fi
cmake
\
-D
CMAKE_PREFIX_PATH
=
/opt/rocm/
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_CXX_FLAGS
=
"-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker "
\
-D
CMAKE_CXX_FLAGS
=
"-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker
--save-temps -v
"
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
BUILD_DEV
=
ON
\
-D
GPU_TARGETS
=
$GPU_TARGETS
\
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment