Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
f98de64a
Commit
f98de64a
authored
Jun 10, 2022
by
ltqin
Browse files
regular code
parent
6e3b47c3
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
52 additions
and
89 deletions
+52
-89
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
...operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
+52
-89
No files found.
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
View file @
f98de64a
...
...
@@ -442,7 +442,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
a_thread_desc_k0_k1_k2_m0_m1_m2_m3_k3
.
GetElementSpaceSize
(),
true
>
{};
},
Number
<
4
>
{});
Number
<
BaseMultK0
>
{});
const
auto
wave_k_m_id
=
GetWaveKMIdx
(
wave_id
[
I2
]);
auto
a_threadwise_copy
=
...
...
@@ -488,7 +488,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
b_thread_desc_k0_k1_k2_n0_n1_n2_n3_k3
.
GetElementSpaceSize
(),
true
>
{};
},
Number
<
4
>
{});
Number
<
BaseMultK0
>
{});
const
auto
wave_k_n_id
=
GetWaveKNIdx
(
wave_id
[
I2
]);
...
...
@@ -563,71 +563,54 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
{
// Read
auto
read_first_half_data
=
[
&
]()
{
a_threadwise_copy
.
Run
(
a_grid_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
a_grid_buf
,
a_thread_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
a_thread_buf
(
Number
<
0
>
{}));
a_threadwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
a_thread_slice_copy_step
);
b_threadwise_copy
.
Run
(
b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
b_grid_buf
,
b_thread_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
b_thread_buf
(
Number
<
0
>
{}));
b_threadwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
b_thread_slice_copy_step
);
a_threadwise_copy
.
Run
(
a_grid_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
a_grid_buf
,
a_thread_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
a_thread_buf
(
Number
<
1
>
{}));
a_threadwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
a_thread_slice_copy_step
);
b_threadwise_copy
.
Run
(
b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
b_grid_buf
,
b_thread_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
b_thread_buf
(
Number
<
1
>
{}));
b_threadwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
b_thread_slice_copy_step
);
static_for
<
0
,
BaseMultK0
/
2
,
1
>
{}([
&
](
auto
i
)
{
a_threadwise_copy
.
Run
(
a_grid_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
a_grid_buf
,
a_thread_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
a_thread_buf
(
Number
<
i
>
{}));
a_threadwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
a_thread_slice_copy_step
);
b_threadwise_copy
.
Run
(
b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
b_grid_buf
,
b_thread_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
b_thread_buf
(
Number
<
i
>
{}));
b_threadwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
b_thread_slice_copy_step
);
});
};
auto
read_last_half_data
=
[
&
]()
{
a_threadwise_copy
.
Run
(
a_grid_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
a_grid_buf
,
a_thread_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
a_thread_buf
(
Number
<
2
>
{}));
a_threadwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
a_thread_slice_copy_step
);
b_threadwise_copy
.
Run
(
b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
b_grid_buf
,
b_thread_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
b_thread_buf
(
Number
<
2
>
{}));
b_threadwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
b_thread_slice_copy_step
);
a_threadwise_copy
.
Run
(
a_grid_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
a_grid_buf
,
a_thread_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
a_thread_buf
(
Number
<
3
>
{}));
a_threadwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
a_thread_slice_copy_step
);
b_threadwise_copy
.
Run
(
b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
b_grid_buf
,
b_thread_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
b_thread_buf
(
Number
<
3
>
{}));
b_threadwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
b_thread_slice_copy_step
);
static_for
<
BaseMultK0
/
2
,
BaseMultK0
,
1
>
{}([
&
](
auto
i
)
{
a_threadwise_copy
.
Run
(
a_grid_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
a_grid_buf
,
a_thread_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
a_thread_buf
(
Number
<
i
>
{}));
a_threadwise_copy
.
MoveSrcSliceWindow
(
a_grid_desc_k0_k1_k2_m0_m1_m2_m3_k3
,
a_thread_slice_copy_step
);
b_threadwise_copy
.
Run
(
b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
b_grid_buf
,
b_thread_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
b_thread_buf
(
Number
<
i
>
{}));
b_threadwise_copy
.
MoveSrcSliceWindow
(
b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3
,
b_thread_slice_copy_step
);
});
};
auto
run_first_half_gemm
=
[
&
]()
{
static_for
<
0
,
BaseMultK0
/
2
,
1
>
{}([
&
](
auto
i
)
{
blockwise_gemm
.
Run
(
a_thread_buf
(
Number
<
i
>
{}),
b_thread_buf
(
Number
<
i
>
{}),
c_thread_buf
);
});
};
auto
run_last_half_gemm
=
[
&
]()
{
static_for
<
BaseMultK0
/
2
,
BaseMultK0
,
1
>
{}([
&
](
auto
i
)
{
blockwise_gemm
.
Run
(
a_thread_buf
(
Number
<
i
>
{}),
b_thread_buf
(
Number
<
i
>
{}),
c_thread_buf
);
});
};
read_first_half_data
();
// Initialize C
...
...
@@ -641,24 +624,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
index_t
i
=
0
;
do
{
// 1st
read_last_half_data
();
s_nop
();
blockwise_gemm
.
Run
(
a_thread_buf
(
Number
<
0
>
{}),
b_thread_buf
(
Number
<
0
>
{}),
c_thread_buf
);
blockwise_gemm
.
Run
(
a_thread_buf
(
Number
<
1
>
{}),
b_thread_buf
(
Number
<
1
>
{}),
c_thread_buf
);
run_first_half_gemm
();
read_first_half_data
();
s_nop
();
blockwise_gemm
.
Run
(
a_thread_buf
(
Number
<
2
>
{}),
b_thread_buf
(
Number
<
2
>
{}),
c_thread_buf
);
blockwise_gemm
.
Run
(
a_thread_buf
(
Number
<
3
>
{}),
b_thread_buf
(
Number
<
3
>
{}),
c_thread_buf
);
run_last_half_gemm
();
i
+=
1
;
}
while
(
i
<
(
K0BlockMainLoop
-
1
));
...
...
@@ -666,18 +637,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
// tail
{
// 1st
read_last_half_data
();
s_nop
();
blockwise_gemm
.
Run
(
a_thread_buf
(
Number
<
0
>
{}),
b_thread_buf
(
Number
<
0
>
{}),
c_thread_buf
);
blockwise_gemm
.
Run
(
a_thread_buf
(
Number
<
1
>
{}),
b_thread_buf
(
Number
<
1
>
{}),
c_thread_buf
);
blockwise_gemm
.
Run
(
a_thread_buf
(
Number
<
2
>
{}),
b_thread_buf
(
Number
<
2
>
{}),
c_thread_buf
);
blockwise_gemm
.
Run
(
a_thread_buf
(
Number
<
3
>
{}),
b_thread_buf
(
Number
<
3
>
{}),
c_thread_buf
);
run_first_half_gemm
();
run_last_half_gemm
();
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment