Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
cfbef574
"...composable_kernel.git" did not exist on "84dcf5d043251ec166f7428c1e2b294500492453"
Commit
cfbef574
authored
May 14, 2024
by
Adam Osewski
Browse files
Fix Coherency bits and for gmem ordering through compiler builtins.
parent
160932b6
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
25 additions
and
5 deletions
+25
-5
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_tile_loop.hpp
...grouped_gemm_multiple_d_splitk_xdl_cshuffle_tile_loop.hpp
+8
-0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle_v2.hpp
.../grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle_v2.hpp
+12
-0
include/ck/utility/amd_buffer_addressing.hpp
include/ck/utility/amd_buffer_addressing.hpp
+5
-5
No files found.
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_tile_loop.hpp
View file @
cfbef574
...
@@ -164,6 +164,14 @@ __global__ void
...
@@ -164,6 +164,14 @@ __global__ void
// do CShuffle in flight with loading partials products of other peer workgroups.
// do CShuffle in flight with loading partials products of other peer workgroups.
GridwiseGemm
::
StorePartials
(
p_workspace
,
static_cast
<
void
*>
(
p_shared
),
results_buffer
);
GridwiseGemm
::
StorePartials
(
p_workspace
,
static_cast
<
void
*>
(
p_shared
),
results_buffer
);
#if 0
// make sure all writes to gmem has finished.
__builtin_amdgcn_s_waitcnt(0x0f70); // s_waitcnt vmcnt(0)
// __builtin_amdgcn_s_waitcnt(0x0070); // s_waitcnt vmcnt(0) lgkmcnt(0)
__builtin_amdgcn_s_barrier(); // s_barrier
// __builtin_amdgcn_sched_barrier(0x0001); // allow all non-memory instructions to pass
__builtin_amdgcn_sched_barrier(0);
#endif
work_scheduler
.
FlagFinished
();
work_scheduler
.
FlagFinished
();
// The workgroup which processed first K tile accumulates results and stores to GMEM
// The workgroup which processed first K tile accumulates results and stores to GMEM
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle_v2.hpp
View file @
cfbef574
...
@@ -1019,7 +1019,13 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2
...
@@ -1019,7 +1019,13 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2
MakeWorkspaceGridDesc_GridSize_MPerBlock_I1_NPerBlock
(
get_grid_size
());
MakeWorkspaceGridDesc_GridSize_MPerBlock_I1_NPerBlock
(
get_grid_size
());
auto
p_workspace_grid
=
reinterpret_cast
<
AccDataType
*>
(
p_workspace
);
auto
p_workspace_grid
=
reinterpret_cast
<
AccDataType
*>
(
p_workspace
);
auto
w_grid_buf
=
auto
w_grid_buf
=
#if(defined(__gfx908__) || defined(__gfx90a__))
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
,
AmdBufferCoherenceEnum
::
GLC
>
(
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
,
AmdBufferCoherenceEnum
::
GLC
>
(
#elif defined(__gfx94__)
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
,
AmdBufferCoherenceEnum
::
SYSTEM_NT0
>
(
#else // for host
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
,
AmdBufferCoherenceEnum
::
DefaultCoherence
>
(
#endif
p_workspace_grid
,
workspace_grid_desc_m0_m1_n0_n1
.
GetElementSpaceSize
());
p_workspace_grid
,
workspace_grid_desc_m0_m1_n0_n1
.
GetElementSpaceSize
());
// shuffle: blockwise copy C from LDS to workspace
// shuffle: blockwise copy C from LDS to workspace
...
@@ -1187,7 +1193,13 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2
...
@@ -1187,7 +1193,13 @@ class GridwiseGemmMultipleD_xdl_splitk_cshuffle_v2
auto
p_workspace_grid
=
reinterpret_cast
<
CShuffleDataType
*>
(
p_workspace
);
auto
p_workspace_grid
=
reinterpret_cast
<
CShuffleDataType
*>
(
p_workspace
);
auto
w_grid_buf
=
auto
w_grid_buf
=
#if(defined(__gfx908__) || defined(__gfx90a__))
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
,
AmdBufferCoherenceEnum
::
GLC
>
(
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
,
AmdBufferCoherenceEnum
::
GLC
>
(
#elif defined(__gfx94__)
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
,
AmdBufferCoherenceEnum
::
SYSTEM_NT0
>
(
#else // for host
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
,
AmdBufferCoherenceEnum
::
DefaultCoherence
>
(
#endif
p_workspace_grid
,
workspace_grid_desc_m0m1_n0n1n2
.
GetElementSpaceSize
());
p_workspace_grid
,
workspace_grid_desc_m0m1_n0n1n2
.
GetElementSpaceSize
());
auto
acc_load
=
ThreadwiseTensorSliceTransfer_v2
<
auto
acc_load
=
ThreadwiseTensorSliceTransfer_v2
<
...
...
include/ck/utility/amd_buffer_addressing.hpp
View file @
cfbef574
...
@@ -297,17 +297,17 @@ enum struct AmdBufferCoherenceEnum
...
@@ -297,17 +297,17 @@ enum struct AmdBufferCoherenceEnum
GLC
=
1
,
GLC
=
1
,
SLC
=
2
,
SLC
=
2
,
GLC_SLC
=
3
,
GLC_SLC
=
3
,
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx94
2
: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
WAVE_NT0
=
0
,
WAVE_NT0
=
0
,
WAVE_NT1
=
2
,
WAVE_NT1
=
2
,
GROUP_NT0
=
1
,
GROUP_NT0
=
1
,
GROUP_NT1
=
3
,
GROUP_NT1
=
3
,
DEVICE_NT0
=
8
,
DEVICE_NT0
=
16
,
DEVICE_NT1
=
1
0
,
DEVICE_NT1
=
1
8
,
SYSTEM_NT0
=
9
,
SYSTEM_NT0
=
17
,
SYSTEM_NT1
=
1
1
,
SYSTEM_NT1
=
1
9
,
};
};
template
<
index_t
N
,
AmdBufferCoherenceEnum
coherence
=
AmdBufferCoherenceEnum
::
DefaultCoherence
>
template
<
index_t
N
,
AmdBufferCoherenceEnum
coherence
=
AmdBufferCoherenceEnum
::
DefaultCoherence
>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment