Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
2f5ae075
Commit
2f5ae075
authored
May 10, 2023
by
carlushuang
Browse files
improve perf a little bit by swizzle block idx
parent
9860dad8
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
49 additions
and
21 deletions
+49
-21
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
...sor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
+0
-6
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+41
-6
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
...ensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
+8
-9
No files found.
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
View file @
2f5ae075
...
@@ -202,12 +202,6 @@ struct DeviceGemmXdlStreamK : public DeviceGemm<ALayout,
...
@@ -202,12 +202,6 @@ struct DeviceGemmXdlStreamK : public DeviceGemm<ALayout,
hip_check_error
(
rtn
);
hip_check_error
(
rtn
);
num_cu
=
dev_prop
.
multiProcessorCount
;
num_cu
=
dev_prop
.
multiProcessorCount
;
printf
(
"XXX occupancy:%d, num_cu:%d, BLOCK_SIZE:%d, LDS:%d
\n
"
,
occupancy
,
num_cu
,
BlockSize
,
GridwiseGemm
::
GetSharedMemoryNumberOfByte
());
return
Argument
{
p_a
,
return
Argument
{
p_a
,
p_b
,
p_b
,
p_c
,
p_c
,
...
...
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
View file @
2f5ae075
...
@@ -635,7 +635,7 @@ struct BlockToCTileMap_3DGrid_KSplit
...
@@ -635,7 +635,7 @@ struct BlockToCTileMap_3DGrid_KSplit
return
true
;
return
true
;
}
}
};
};
#include <stdlib.h>
template
<
uint32_t
MPerBlock_
,
uint32_t
NPerBlock_
,
uint32_t
KPerBlock_
>
template
<
uint32_t
MPerBlock_
,
uint32_t
NPerBlock_
,
uint32_t
KPerBlock_
>
struct
BlockToCTileMap_GemmStreamK
struct
BlockToCTileMap_GemmStreamK
{
{
...
@@ -657,11 +657,22 @@ struct BlockToCTileMap_GemmStreamK
...
@@ -657,11 +657,22 @@ struct BlockToCTileMap_GemmStreamK
uint32_t
k_iters_per_big_block
;
uint32_t
k_iters_per_big_block
;
MDiv
k_iters_per_tile
;
MDiv
k_iters_per_tile
;
MDiv
n_tiles
;
MDiv
n_tiles
;
MDiv
tile_swizzle_sub_m
;
MDiv
tile_swizzle_sub_m_rem
;
//--------------------------------------
//--------------------------------------
static
int
env_get_int
(
const
char
*
var_name
,
int
default_int
)
{
char
*
v
=
getenv
(
var_name
);
int
r
=
default_int
;
if
(
v
)
r
=
atoi
(
v
);
return
r
;
}
// prefer construct on host
// prefer construct on host
BlockToCTileMap_GemmStreamK
(
BlockToCTileMap_GemmStreamK
(
uint32_t
m
,
uint32_t
n
,
uint32_t
k
,
uint32_t
num_cu
,
uint32_t
occupancy
)
uint32_t
m
,
uint32_t
n
,
uint32_t
k
,
uint32_t
num_cu
,
uint32_t
occupancy
,
uint32_t
tile_swizzle_sub_m_factor
=
8
)
{
{
uint32_t
num_tiles
=
uint32_t
num_tiles
=
math
::
integer_divide_ceil
(
m
,
MPerBlock
)
*
math
::
integer_divide_ceil
(
n
,
NPerBlock
);
math
::
integer_divide_ceil
(
m
,
MPerBlock
)
*
math
::
integer_divide_ceil
(
n
,
NPerBlock
);
...
@@ -760,6 +771,8 @@ struct BlockToCTileMap_GemmStreamK
...
@@ -760,6 +771,8 @@ struct BlockToCTileMap_GemmStreamK
sk_num_blocks
=
0
;
sk_num_blocks
=
0
;
}
}
sk_num_blocks
=
env_get_int
(
"sk_num_blocks"
,
sk_num_blocks
);
if
(
sk_num_blocks
==
0
)
if
(
sk_num_blocks
==
0
)
{
{
sk_num_big_blocks
=
0
;
sk_num_big_blocks
=
0
;
...
@@ -791,13 +804,17 @@ struct BlockToCTileMap_GemmStreamK
...
@@ -791,13 +804,17 @@ struct BlockToCTileMap_GemmStreamK
}
}
}
}
n_tiles
=
MDiv
(
math
::
integer_divide_ceil
(
n
,
NPerBlock
));
n_tiles
=
MDiv
(
math
::
integer_divide_ceil
(
n
,
NPerBlock
));
tile_swizzle_sub_m
=
MDiv
(
tile_swizzle_sub_m_factor
);
tile_swizzle_sub_m_rem
=
MDiv
(
math
::
integer_divide_ceil
(
m
,
MPerBlock
)
%
tile_swizzle_sub_m_factor
);
printf
(
"cu:%d, occupancy:%d, grids:%d, sk_num_big_blocks:%d, sk_num_blocks:%d, "
printf
(
"cu:%d, occupancy:%d, grids:%d,
num_tiles:%d, dp_tiles:%d,
sk_num_big_blocks:%d, sk_num_blocks:%d, "
"sk_total_iters:%d, dp_start_block_idx:%d, dp_iters_per_block:%d, dp_num_blocks:%d, "
"sk_total_iters:%d, dp_start_block_idx:%d, dp_iters_per_block:%d, dp_num_blocks:%d, "
"k_iters_per_tile:%d, k_iters_per_big_block:%d
\n
"
,
"k_iters_per_tile:%d, k_iters_per_big_block:%d
\n
"
,
num_cu
,
num_cu
,
occupancy
,
occupancy
,
get_grid_dims
().
x
,
get_grid_dims
().
x
,
num_tiles
,
dp_tiles
,
sk_num_big_blocks
,
sk_num_big_blocks
,
sk_num_blocks
,
sk_num_blocks
,
sk_total_iters
,
sk_total_iters
,
...
@@ -859,12 +876,30 @@ struct BlockToCTileMap_GemmStreamK
...
@@ -859,12 +876,30 @@ struct BlockToCTileMap_GemmStreamK
k_iters_per_tile
.
divmod
(
iter
,
tile_idx
,
iter_offset
);
k_iters_per_tile
.
divmod
(
iter
,
tile_idx
,
iter_offset
);
}
}
__device__
auto
tile_to_spatial
(
uint32_t
tile_idx
)
const
__device__
auto
tile_to_spatial
(
uint32_t
tile_idx
,
uint32_t
m
,
uint32_t
/*n*/
)
const
{
{
// TODO:
uint32_t
m_tile_idx
,
n_tile_idx
;
uint32_t
m_tile_idx
,
n_tile_idx
;
n_tiles
.
divmod
(
tile_idx
,
m_tile_idx
,
n_tile_idx
);
n_tiles
.
divmod
(
tile_idx
,
m_tile_idx
,
n_tile_idx
);
return
make_tuple
(
m_tile_idx
,
n_tile_idx
);
// return make_tuple(m_tile_idx, n_tile_idx);
// swizzle tile
uint32_t
m_tiles
=
math
::
integer_divide_ceil
(
m
,
MPerBlock
);
// uint32_t n_tiles = math::integer_divide_ceil(n, NPerBlock);
uint32_t
quo_sub_m
,
rem_sub_m
;
tile_swizzle_sub_m
.
divmod
(
m_tile_idx
,
quo_sub_m
,
rem_sub_m
);
const
auto
sub_m_adapt
=
(
m_tile_idx
<
(
m_tiles
-
tile_swizzle_sub_m_rem
.
get
()))
?
tile_swizzle_sub_m
:
tile_swizzle_sub_m_rem
;
uint32_t
m_tile_idx_sub0
,
m_tile_idx_sub1
;
tile_swizzle_sub_m
.
divmod
(
m_tile_idx
,
m_tile_idx_sub0
,
m_tile_idx_sub1
);
uint32_t
tile_idx_local
=
n_tile_idx
+
m_tile_idx_sub1
*
n_tiles
.
get
();
uint32_t
m_tile_idx_with_adapt
,
n_tile_idx_with_adapt
;
sub_m_adapt
.
divmod
(
tile_idx_local
,
n_tile_idx_with_adapt
,
m_tile_idx_with_adapt
);
return
make_tuple
(
m_tile_idx_with_adapt
+
m_tile_idx_sub0
*
tile_swizzle_sub_m
.
get
(),
n_tile_idx_with_adapt
);
}
}
};
};
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
View file @
2f5ae075
...
@@ -474,7 +474,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
...
@@ -474,7 +474,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
block_mapping
.
get_block_itr
(
block_idx
,
iter_start
,
iter_end
);
block_mapping
.
get_block_itr
(
block_idx
,
iter_start
,
iter_end
);
uint32_t
total_iter_length
=
iter_end
-
iter_start
;
uint32_t
total_iter_length
=
iter_end
-
iter_start
;
// if(threadIdx.x == 0)
// if(threadIdx.x == 0)
// printf("xxx bid:%d\n", static_cast<int>(blockIdx.x));
// printf("xxx bid:%d
, is_sk_block:%d, is_dp_block:%d
\n", static_cast<int>(blockIdx.x)
, is_sk_block, is_dp_block
);
if
(
!
is_sk_block
&&
!
is_dp_block
)
if
(
!
is_sk_block
&&
!
is_dp_block
)
return
;
return
;
...
@@ -485,7 +485,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
...
@@ -485,7 +485,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
uint32_t
tile_idx
,
iter_offset
;
uint32_t
tile_idx
,
iter_offset
;
block_mapping
.
get_tile_idx_with_offset
(
iter_end
-
1
,
tile_idx
,
iter_offset
);
block_mapping
.
get_tile_idx_with_offset
(
iter_end
-
1
,
tile_idx
,
iter_offset
);
iter_offset
=
__builtin_amdgcn_readfirstlane
(
iter_offset
-
current_iter_length
+
1
);
iter_offset
=
__builtin_amdgcn_readfirstlane
(
iter_offset
-
current_iter_length
+
1
);
auto
spatial_idx
=
block_mapping
.
tile_to_spatial
(
tile_idx
);
auto
spatial_idx
=
block_mapping
.
tile_to_spatial
(
tile_idx
,
m
,
n
);
const
index_t
m_block_data_idx_on_grid
=
const
index_t
m_block_data_idx_on_grid
=
__builtin_amdgcn_readfirstlane
(
spatial_idx
[
I0
]
*
MPerBlock
);
__builtin_amdgcn_readfirstlane
(
spatial_idx
[
I0
]
*
MPerBlock
);
...
@@ -496,13 +496,12 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
...
@@ -496,13 +496,12 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
const
index_t
k0_block_data_idx_on_grid
=
const
index_t
k0_block_data_idx_on_grid
=
__builtin_amdgcn_readfirstlane
(
iter_offset
*
K0PerBlock
);
__builtin_amdgcn_readfirstlane
(
iter_offset
*
K0PerBlock
);
// if(threadIdx.x == 0)
// if(threadIdx.x == 0)
// printf("[%s], bid:%d, block_idx:%d, tile_idx:%d(%d, %d, %d), iter_start:%d(%d |
// printf("[%s], bid:%d, block_idx:%d, tile_idx:%d(%d, %d, %d), iter_start:%d(%d | %d), iter_end:%d, len:%d\n",
// %d), iter_end:%d, len:%d\n",
// is_sk_block ? "sk_block" : (is_dp_block ? "dp_block" : "other "),
// is_sk_block ? "sk_block" : (is_dp_block ? "dp_block" : "other "),
// static_cast<int>(blockIdx.x), block_idx, tile_idx, m_block_data_idx_on_grid,
// static_cast<int>(blockIdx.x), block_idx, tile_idx, m_block_data_idx_on_grid,
// n_block_data_idx_on_grid, k0_block_data_idx_on_grid, iter_end -
// n_block_data_idx_on_grid, k0_block_data_idx_on_grid, iter_end -
// current_iter_length, iter_offset, iter_start, iter_end, current_iter_length);
// current_iter_length, iter_offset, iter_start, iter_end, current_iter_length);
// A matrix blockwise copy
// A matrix blockwise copy
auto
a_blockwise_copy
=
auto
a_blockwise_copy
=
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment