Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
c2e5822c
Commit
c2e5822c
authored
Sep 07, 2022
by
Po-Yen, Chen
Browse files
Use the normal Block2TileMap convention
parent
ed794598
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
65 additions
and
33 deletions
+65
-33
include/ck/tensor_operation/gpu/device/device_permute.hpp
include/ck/tensor_operation/gpu/device/device_permute.hpp
+18
-18
include/ck/tensor_operation/gpu/grid/gridwise_copy.hpp
include/ck/tensor_operation/gpu/grid/gridwise_copy.hpp
+47
-15
No files found.
include/ck/tensor_operation/gpu/device/device_permute.hpp
View file @
c2e5822c
...
...
@@ -111,9 +111,7 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
}
static
auto
MakeDescriptor_N_H_W
(
const
std
::
array
<
index_t
,
NumDim
>&
lengths
,
const
std
::
array
<
index_t
,
NumDim
>&
stride
,
index_t
gridSize
,
index_t
blockSize
)
const
std
::
array
<
index_t
,
NumDim
>&
stride
)
{
// create nd descriptor, shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-2],
// d[NumDim-1]]
...
...
@@ -139,8 +137,8 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
desc_n_h_w
,
make_tuple
(
NPerBlock
,
HPerBlock
,
WPerBlock
),
Sequence
<
true
,
true
,
true
>
{});
}
using
InGrid1dDesc
=
decltype
(
MakeDescriptor_N_H_W
({
1
,
1
},
{
1
,
1
}
,
1
,
1
));
using
OutGrid1dDesc
=
decltype
(
MakeDescriptor_N_H_W
({
1
,
1
},
{
1
,
1
}
,
1
,
1
));
using
InGrid1dDesc
=
decltype
(
MakeDescriptor_N_H_W
({
1
,
1
},
{
1
,
1
}));
using
OutGrid1dDesc
=
decltype
(
MakeDescriptor_N_H_W
({
1
,
1
},
{
1
,
1
}));
using
GridwiseCopy
=
GridwiseCopy
<
InGrid1dDesc
,
OutGrid1dDesc
,
...
...
@@ -164,23 +162,19 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
const
void
*
in_dev_buffer
,
void
*
out_dev_buffer
,
ElementwiseOperation
elementwise_op
)
:
blockSize_
(
256
),
gridSize_
(
120
),
// FIXME - Calculate the grid size by number of CU in the future
in_dev_buffer_
(
static_cast
<
InDataTypePointer
>
(
in_dev_buffer
)),
:
in_dev_buffer_
(
static_cast
<
InDataTypePointer
>
(
in_dev_buffer
)),
out_dev_buffer_
(
static_cast
<
OutDataTypePointer
>
(
out_dev_buffer
)),
in_grid_1d_desc_
(
MakeDescriptor_N_H_W
(
inLengths
,
inStrides
,
gridSize_
,
blockSize_
)),
out_grid_1d_desc_
(
MakeDescriptor_N_H_W
(
inLengths
,
inStrides
,
gridSize_
,
blockSize_
)),
in_grid_1d_desc_
(
MakeDescriptor_N_H_W
(
inLengths
,
inStrides
)),
out_grid_1d_desc_
(
MakeDescriptor_N_H_W
(
inLengths
,
inStrides
)),
inLengths_
(
inLengths
),
inStrides_
(
inStrides
),
outLengths_
(
outLengths
),
outStrides_
(
outStrides
),
elementwise_op_
(
elementwise_op
)
elementwise_op_
(
elementwise_op
),
block_2_tile_map_
(
GridwiseCopy
::
MakeDefaultBlock2TileMap
(
in_grid_1d_desc_
))
{
}
index_t
blockSize_
;
index_t
gridSize_
;
InDataTypePointer
in_dev_buffer_
;
OutDataTypePointer
out_dev_buffer_
;
InGrid1dDesc
in_grid_1d_desc_
;
...
...
@@ -192,29 +186,35 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
std
::
array
<
index_t
,
NumDim
>
outStrides_
;
ElementwiseOperation
elementwise_op_
;
typename
GridwiseCopy
::
DefaultBlock2TileMap
block_2_tile_map_
;
};
struct
Invoker
:
detail
::
InvokerBase
<
Invoker
,
Argument
>
{
static
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
const
index_t
grid_size
=
arg
.
block_2_tile_map_
.
CalculateGridSize
(
arg
.
in_grid_1d_desc_
);
const
auto
kernel
=
kernel_nd_copy
<
GridwiseCopy
,
InGrid1dDesc
,
OutGrid1dDesc
,
InDataTypePointer
,
OutDataTypePointer
,
ElementwiseOperation
>
;
ElementwiseOperation
,
typename
GridwiseCopy
::
DefaultBlock2TileMap
>
;
float
elapsed_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
arg
.
grid
S
ize
_
),
dim3
(
arg
.
b
lockSize
_
),
dim3
(
grid
_s
ize
),
dim3
(
B
lockSize
),
0
,
arg
.
in_grid_1d_desc_
,
arg
.
out_grid_1d_desc_
,
arg
.
in_dev_buffer_
,
arg
.
out_dev_buffer_
,
arg
.
elementwise_op_
);
arg
.
elementwise_op_
,
arg
.
block_2_tile_map_
);
return
elapsed_time
;
}
};
...
...
include/ck/tensor_operation/gpu/grid/gridwise_copy.hpp
View file @
c2e5822c
...
...
@@ -23,12 +23,28 @@ struct BlockToTileMap
static
constexpr
index_t
NumDim
=
TileDims
::
Size
();
static_assert
(
NumDim
==
GridDescriptor
::
GetNumOfDimension
());
BlockToTileMap
()
=
delete
;
~
BlockToTileMap
()
=
delete
;
BlockToTileMap
()
=
default
;
BlockToTileMap
(
const
GridDescriptor
&
desc
)
:
desc_
(
desc
)
{}
__host__
constexpr
index_t
CalculateGridSize
(
const
GridDescriptor
&
desc
)
const
{
return
[
&
]()
{
std
::
array
<
index_t
,
NumDim
>
num_tiles_per_axis
;
static_for
<
0
,
NumDim
,
1
>
{}([
&
](
auto
I
)
{
num_tiles_per_axis
[
I
]
=
math
::
integer_divide_ceil
(
desc
.
GetLength
(
I
),
TileDims
::
At
(
I
));
});
return
std
::
accumulate
(
begin
(
num_tiles_per_axis
),
end
(
num_tiles_per_axis
),
index_t
{
1
},
std
::
multiplies
<
index_t
>
{});
}();
}
template
<
typename
TopIdx
>
__host__
__device__
static
constexpr
auto
CalculateBottomIndex
(
const
GridDescriptor
&
desc
,
const
TopIdx
&
idx_top
)
__host__
__device__
constexpr
auto
CalculateBottomIndex
(
const
TopIdx
&
idx_top
)
const
{
static_assert
(
TopIdx
::
Size
()
==
1
);
...
...
@@ -36,7 +52,7 @@ struct BlockToTileMap
std
::
array
<
index_t
,
NumDim
>
num_tiles_per_axis
;
static_for
<
0
,
NumDim
,
1
>
{}([
&
](
auto
I
)
{
num_tiles_per_axis
[
I
]
=
math
::
integer_divide_ceil
(
desc
.
GetLength
(
I
),
TileDims
::
At
(
I
));
num_tiles_per_axis
[
I
]
=
math
::
integer_divide_ceil
(
desc
_
.
GetLength
(
I
),
TileDims
::
At
(
I
));
});
std
::
array
<
index_t
,
NumDim
>
divisors
;
...
...
@@ -54,6 +70,9 @@ struct BlockToTileMap
},
Number
<
NumDim
>
{});
}
private:
const
GridDescriptor
desc_
;
};
}
// namespace detail
...
...
@@ -62,15 +81,21 @@ template <typename GridwiseCopyFunctor,
typename
OutGrid1dDesc
,
typename
InDataTypePointer
,
typename
OutDataTypePointer
,
typename
ElementwiseOperation
>
typename
ElementwiseOperation
,
typename
Block2TileMap
>
__global__
void
kernel_nd_copy
(
const
InGrid1dDesc
in_grid_1d_desc
,
const
OutGrid1dDesc
out_grid_1d_desc
,
const
InDataTypePointer
p_in_global
,
const
OutDataTypePointer
p_out_global
,
const
ElementwiseOperation
elementwise_op
)
const
ElementwiseOperation
elementwise_op
,
const
Block2TileMap
block_2_tile_map
)
{
GridwiseCopyFunctor
::
Run
(
in_grid_1d_desc
,
out_grid_1d_desc
,
p_in_global
,
p_out_global
,
elementwise_op
);
GridwiseCopyFunctor
::
Run
(
in_grid_1d_desc
,
out_grid_1d_desc
,
p_in_global
,
p_out_global
,
elementwise_op
,
block_2_tile_map
);
}
template
<
typename
InGrid1dDesc
,
...
...
@@ -100,10 +125,10 @@ struct GridwiseCopy
using
ThisThreadBlock
=
ThisThreadBlock
<
BlockSize
>
;
using
Block
To
TileMap
=
using
Default
Block
2
TileMap
=
detail
::
BlockToTileMap
<
Sequence
<
NPerBlock
,
HPerBlock
,
WPerBlock
>
,
InGrid1dDesc
>
;
__host__
__device__
static
constexpr
auto
Get
A
BlockDescriptor
_AK0PerBlock_MPerBlock_AK1
()
__host__
__device__
static
constexpr
auto
Get
In
BlockDescriptor
()
{
constexpr
index_t
ABlockLdsExtraM
=
0
;
...
...
@@ -115,11 +140,18 @@ struct GridwiseCopy
I1
));
}
__host__
__device__
static
constexpr
auto
MakeDefaultBlock2TileMap
(
const
InGrid1dDesc
&
desc
)
{
return
DefaultBlock2TileMap
{
desc
};
}
template
<
typename
Block2TileMap
>
__device__
static
void
Run
(
const
InGrid1dDesc
in_grid_1d_desc
,
const
OutGrid1dDesc
out_grid_1d_desc
,
const
InDataTypePointer
p_in_global
,
const
OutDataTypePointer
p_out_global
,
const
ElementwiseOperation
elementwise_op
)
const
ElementwiseOperation
elementwise_op
,
const
Block2TileMap
&
block_2_tile_map
)
{
const
index_t
thread_global_id
=
get_thread_global_1d_id
();
...
...
@@ -156,8 +188,8 @@ struct GridwiseCopy
1, // SrcScalarStrideInVector
false>{in_grid_1d_desc, thread_global_offset};
#else
const
auto
block_work_idx
=
BlockToTileMap
::
CalculateBottomIndex
(
in_grid_1d_desc
,
make_multi_index
(
get_block_1d_id
()));
const
auto
block_work_idx
=
block_2_tile_map
.
CalculateBottomIndex
(
make_multi_index
(
get_block_1d_id
()));
// HACK: this force m/n_block_data_idx_on_grid into SGPR
const
index_t
m_block_data_idx_on_grid
=
...
...
@@ -167,7 +199,7 @@ struct GridwiseCopy
// __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
// A matrix in LDS memory, dst of blockwise copy
constexpr
auto
a_block_desc_ak0_m_ak1
=
Get
A
BlockDescriptor
_AK0PerBlock_MPerBlock_AK1
();
constexpr
auto
a_block_desc_ak0_m_ak1
=
Get
In
BlockDescriptor
();
// // B matrix in LDS memory, dst of blockwise copy
// constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment