Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
c8c016dd
Commit
c8c016dd
authored
Dec 13, 2024
by
aska-0096
Browse files
Merge branch 'develop' of
https://github.com/ROCm/composable_kernel
into update_cka8w8
parents
e8ca3daf
4e731776
Changes
399
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
843 additions
and
785 deletions
+843
-785
include/ck/library/utility/host_gemm.hpp
include/ck/library/utility/host_gemm.hpp
+0
-0
include/ck/library/utility/host_tensor.hpp
include/ck/library/utility/host_tensor.hpp
+1
-1
include/ck/library/utility/host_tensor_generator.hpp
include/ck/library/utility/host_tensor_generator.hpp
+25
-6
include/ck/library/utility/iterator.hpp
include/ck/library/utility/iterator.hpp
+0
-0
include/ck/library/utility/literals.hpp
include/ck/library/utility/literals.hpp
+0
-0
include/ck/library/utility/numeric.hpp
include/ck/library/utility/numeric.hpp
+0
-0
include/ck/library/utility/ranges.hpp
include/ck/library/utility/ranges.hpp
+0
-0
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
...operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
+64
-66
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp
...operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp
+32
-33
include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
...de/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
+131
-1
include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp
...sor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp
+13
-37
include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp
...tion/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp
+0
-136
include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp
...ensor_operation/gpu/device/device_grouped_gemm_splitk.hpp
+18
-2
include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp
...or_operation/gpu/device/device_grouped_gemm_tile_loop.hpp
+6
-86
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
...n/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
+309
-73
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
...device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+48
-138
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
...vice_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+131
-163
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
...device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
+1
-2
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
...ion/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+6
-6
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
...grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
+58
-35
No files found.
library/
include/ck/library/utility/host_gemm.hpp
→
include/ck/library/utility/host_gemm.hpp
View file @
c8c016dd
File moved
library/
include/ck/library/utility/host_tensor.hpp
→
include/ck/library/utility/host_tensor.hpp
View file @
c8c016dd
...
...
@@ -326,7 +326,7 @@ struct Tensor
std
::
size_t
GetElementSpaceSizeInBytes
()
const
{
return
sizeof
(
T
)
*
GetElementSpaceSize
();
}
void
SetZero
()
{
ck
::
ranges
::
fill
<
T
>
(
mData
,
0
);
}
void
SetZero
()
{
ck
::
ranges
::
fill
<
T
>
(
mData
,
T
{
0
}
);
}
template
<
typename
F
>
void
ForEach_impl
(
F
&&
f
,
std
::
vector
<
size_t
>&
idx
,
size_t
rank
)
...
...
library/
include/ck/library/utility/host_tensor_generator.hpp
→
include/ck/library/utility/host_tensor_generator.hpp
View file @
c8c016dd
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -37,7 +37,7 @@ struct GeneratorTensor_1<ck::half_t>
float
value
=
1.0
;
template
<
typename
...
Is
>
ck
::
b
half_t
operator
()(
Is
...)
ck
::
half_t
operator
()(
Is
...)
{
return
ck
::
type_convert
<
ck
::
half_t
>
(
value
);
}
...
...
@@ -62,7 +62,7 @@ struct GeneratorTensor_1<ck::f8_t>
float
value
=
1.0
;
template
<
typename
...
Is
>
ck
::
bhal
f_t
operator
()(
Is
...)
ck
::
f
8
_t
operator
()(
Is
...)
{
return
ck
::
type_convert
<
ck
::
f8_t
>
(
value
);
}
...
...
@@ -256,14 +256,33 @@ struct GeneratorTensor_Checkboard
}
};
template
<
ck
::
index_t
Dim
>
/**
* @brief Is used to generate sequential values based on the specified dimension.
*
* @tparam T The type of the tensor values.
* @tparam Dim The specific dimension used for generation.
*
* GeneratorTensor_Sequential<1>{} will generate the following values for a 3x3 tensor:
*
* 0 1 2
* 0 1 2
* 0 1 2
*
* Essentially, the values generated are logical coordinates of the generated element that
* correspond to dimension Dim. E.g. for 2-dimensional tensor and Dim=1, the values are the column
* indices.
*
*/
template
<
typename
T
,
ck
::
index_t
Dim
>
struct
GeneratorTensor_Sequential
{
template
<
typename
...
Ts
>
float
operator
()(
Ts
...
Xs
)
const
T
operator
()(
Ts
...
Xs
)
const
{
std
::
array
<
ck
::
index_t
,
sizeof
...(
Ts
)
>
dims
=
{{
static_cast
<
ck
::
index_t
>
(
Xs
)...}};
return
dims
[
Dim
];
float
tmp
=
dims
[
Dim
];
return
ck
::
type_convert
<
T
>
(
tmp
);
}
};
...
...
library/
include/ck/library/utility/iterator.hpp
→
include/ck/library/utility/iterator.hpp
View file @
c8c016dd
File moved
library/
include/ck/library/utility/literals.hpp
→
include/ck/library/utility/literals.hpp
View file @
c8c016dd
File moved
library/
include/ck/library/utility/numeric.hpp
→
include/ck/library/utility/numeric.hpp
View file @
c8c016dd
File moved
library/
include/ck/library/utility/ranges.hpp
→
include/ck/library/utility/ranges.hpp
View file @
c8c016dd
File moved
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2.hpp
View file @
c8c016dd
...
...
@@ -278,9 +278,9 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_
,
make_tuple
(
m0
,
I0
,
k
,
I0
),
a_thread_buf
);
});
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
make_tuple
(
n0
,
I0
,
I0
,
Number
<
k
*
BMmaKStride
>
{}),
b_block_buf
,
b_thread_desc_
,
...
...
@@ -288,7 +288,6 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
b_thread_buf
);
});
});
});
static_for
<
0
,
KRepeat
,
1
>
{}([
&
](
auto
k0
)
{
static_for
<
0
,
MRepeat
,
1
>
{}([
&
](
auto
m0
)
{
...
...
@@ -350,6 +349,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_
,
make_tuple
(
m0
,
I0
,
k
,
I0
),
a_thread_buf
);
});
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
make_tuple
(
n0
,
I0
,
I0
,
Number
<
k
*
BMmaKStride
>
{}),
...
...
@@ -359,7 +359,6 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
b_thread_buf
);
});
});
});
static_for
<
0
,
KRepeat
,
1
>
{}([
&
](
auto
k0
)
{
static_for
<
0
,
MRepeat
,
1
>
{}([
&
](
auto
m0
)
{
...
...
@@ -405,6 +404,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_
,
make_tuple
(
m0
,
I0
,
k
,
I0
),
a_thread_buf
);
});
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
make_tuple
(
n0
,
I0
,
I0
,
Number
<
k
*
BMmaKStride
>
{}),
...
...
@@ -414,7 +414,6 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
b_thread_buf
);
});
});
});
a_thread_buf_tail
=
a_thread_buf
;
b_thread_buf_tail
=
b_thread_buf
;
...
...
@@ -431,6 +430,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_
,
make_tuple
(
m0
,
I0
,
k
,
I0
),
a_thread_buf
);
});
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
make_tuple
(
n0
,
I0
,
I0
,
Number
<
k
*
BMmaKStride
>
{}),
...
...
@@ -440,7 +440,6 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Intrawave,
b_thread_buf
);
});
});
});
a_thread_buf_tail
=
a_thread_buf
;
b_thread_buf_tail
=
b_thread_buf
;
...
...
@@ -1370,16 +1369,15 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
a_thread_desc_
,
make_tuple
(
m0
,
I0
,
k0
,
I0
),
a_thread_buf
);
});
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
make_tuple
(
n0
,
I0
,
I0
,
Number
<
k0
*
KPerInnerLoop
>
{}),
b_block_buf
,
b_thread_desc_
,
make_tuple
(
n0
,
I0
,
k0
,
I0
),
b_thread_buf
);
});
});
__builtin_amdgcn_sched_barrier
(
0
);
// NOTE: Synchronize threads in a workgroup at the start of each MAC
// cluster, but except the first, as we can shorten non-MAC cluster a bit
...
...
@@ -1476,6 +1474,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
a_thread_desc_
,
make_tuple
(
m0
,
I0
,
k0
,
I0
),
a_thread_buf
);
});
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
make_tuple
(
n0
,
I0
,
I0
,
Number
<
k0
*
KPerInnerLoop
>
{}),
...
...
@@ -1484,7 +1483,6 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
make_tuple
(
n0
,
I0
,
k0
,
I0
),
b_thread_buf
);
});
});
__builtin_amdgcn_sched_barrier
(
0
);
if
constexpr
(
k0
.
value
!=
0
||
KRepeat
==
1
)
...
...
@@ -1552,6 +1550,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
a_thread_desc_
,
make_tuple
(
m0
,
I0
,
k0
,
I0
),
a_thread_buf
);
});
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
make_tuple
(
n0
,
I0
,
I0
,
Number
<
k0
*
KPerInnerLoop
>
{}),
...
...
@@ -1560,7 +1559,6 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
make_tuple
(
n0
,
I0
,
k0
,
I0
),
b_thread_buf
);
});
});
__builtin_amdgcn_sched_barrier
(
0
);
if
constexpr
(
k0
.
value
!=
0
||
KRepeat
==
1
)
...
...
@@ -1628,6 +1626,7 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
a_thread_desc_
,
make_tuple
(
m0
,
I0
,
k0
,
I0
),
a_thread_buf
);
});
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
make_tuple
(
n0
,
I0
,
I0
,
Number
<
k0
*
KPerInnerLoop
>
{}),
...
...
@@ -1636,7 +1635,6 @@ struct BlockwiseGemmXdlops_pipeline_v2<BlockGemmPipelineScheduler::Interwave,
make_tuple
(
n0
,
I0
,
k0
,
I0
),
b_thread_buf
);
});
});
__builtin_amdgcn_sched_barrier
(
0
);
if
constexpr
(
k0
.
value
!=
0
||
KRepeat
==
1
)
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp
View file @
c8c016dd
...
...
@@ -603,6 +603,7 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_
,
make_tuple
(
m0
,
I0
,
k
,
I0
),
a_thread_bufs
(
I0
));
});
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
make_tuple
(
n0
,
I0
,
I0
,
Number
<
k
*
BMmaKStride
>
{}),
...
...
@@ -612,7 +613,6 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
b_thread_bufs
(
I0
));
});
});
});
// Global prefetch 2
a_blockwise_copy
.
RunRead
(
a_grid_desc
,
a_grid_buf
);
...
...
@@ -656,9 +656,9 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_
,
make_tuple
(
m0
,
I0
,
k
,
I0
),
a_thread_bufs
(
lds_read_reg_buf
));
});
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
make_tuple
(
n0
,
I0
,
I0
,
Number
<
k
*
BMmaKStride
>
{}),
b_block_buf
.
At
(
lds_read_buf
),
b_thread_desc_
,
...
...
@@ -666,7 +666,6 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
b_thread_bufs
(
lds_read_reg_buf
));
});
});
});
a_blockwise_copy
.
RunWrite
(
a_block_desc
,
a_block_buf
.
At
(
lds_write_buf
));
b_blockwise_copy
.
RunWrite
(
b_block_desc
,
b_block_buf
.
At
(
lds_write_buf
));
...
...
@@ -733,6 +732,7 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_
,
make_tuple
(
m0
,
I0
,
k
,
I0
),
a_thread_bufs
(
lds_read_reg_buf
));
});
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
make_tuple
(
n0
,
I0
,
I0
,
Number
<
k
*
BMmaKStride
>
{}),
...
...
@@ -742,7 +742,6 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
b_thread_bufs
(
lds_read_reg_buf
));
});
});
});
a_blockwise_copy
.
RunWrite
(
a_block_desc
,
a_block_buf
.
At
(
lds_write_buf
));
b_blockwise_copy
.
RunWrite
(
b_block_desc
,
b_block_buf
.
At
(
lds_write_buf
));
...
...
@@ -789,6 +788,7 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
a_thread_desc_
,
make_tuple
(
m0
,
I0
,
k
,
I0
),
a_thread_bufs
(
lds_read_reg_buf
));
});
static_for
<
0
,
NRepeat
,
1
>
{}([
&
](
auto
n0
)
{
b_thread_copy_
.
Run
(
b_block_desc_n0_n1_n2_k
,
make_tuple
(
n0
,
I0
,
I0
,
Number
<
k
*
BMmaKStride
>
{}),
...
...
@@ -798,7 +798,6 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
b_thread_bufs
(
lds_read_reg_buf
));
});
});
});
static_for
<
0
,
KRepeat
,
1
>
{}([
&
](
auto
k0
)
{
static_for
<
0
,
MRepeat
,
1
>
{}([
&
](
auto
m0
)
{
...
...
include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp
View file @
c8c016dd
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <array>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <vector>
#include "device_base.hpp"
#include "ck/utility/ignore.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
///
/// @brief Structure representing single GEMM problem arguments.
///
/// The pointer to the vector of those structures is passed to the GroupedGEMM entry
/// point kernel.
///
/// @tparam NumDTensor The number of D input tensors.
///
template
<
index_t
NumDTensor
=
0
>
struct
GroupedGemmKernelArgument
{
__host__
__device__
GroupedGemmKernelArgument
(
const
void
*
p_a_grid_
,
const
void
*
p_b_grid_
,
std
::
array
<
const
void
*
,
NumDTensor
>
p_ds_grid_
,
void
*
p_e_grid_
,
index_t
M_
,
index_t
N_
,
index_t
K_
,
index_t
StrideA_
,
index_t
StrideB_
,
std
::
array
<
index_t
,
NumDTensor
>
StrideDs_
,
index_t
StrideE_
)
:
p_a_grid
{
p_a_grid_
},
p_b_grid
{
p_b_grid_
},
p_ds_grid
{
p_ds_grid_
},
p_e_grid
{
p_e_grid_
},
M
{
M_
},
N
{
N_
},
K
{
K_
},
StrideA
{
StrideA_
},
StrideB
{
StrideB_
},
StrideDs
{
StrideDs_
},
StrideE
{
StrideE_
}
{
}
const
void
*
p_a_grid
;
const
void
*
p_b_grid
;
std
::
array
<
const
void
*
,
NumDTensor
>
p_ds_grid
;
void
*
p_e_grid
;
index_t
M
;
index_t
N
;
index_t
K
;
index_t
StrideA
;
index_t
StrideB
;
std
::
array
<
index_t
,
NumDTensor
>
StrideDs
;
index_t
StrideE
;
void
Print
()
const
{
std
::
stringstream
str
;
for
(
auto
sd
:
StrideDs
)
str
<<
sd
<<
","
;
std
::
cout
<<
"arg {"
<<
"M:"
<<
M
<<
", "
<<
"N:"
<<
N
<<
", "
<<
"K:"
<<
K
<<
", "
<<
"SA:"
<<
StrideA
<<
", "
<<
"SB:"
<<
StrideB
<<
", "
<<
"SE:"
<<
StrideE
<<
", "
<<
"SDs: {"
<<
str
.
str
()
<<
"}"
<<
"}"
<<
std
::
endl
;
}
};
struct
GemmDesc
{
ck
::
index_t
M_
,
N_
,
K_
;
...
...
@@ -48,6 +118,66 @@ struct DeviceGroupedGemm : public BaseOperator
CElementwiseOperation
c_element_op
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
//---------------------------------------------------------------------------------------------
/// @brief Sets the device kernel arguments pointer and may copy data to device.
///
/// TODO: Add which kernels are using this (TileLoop * FixedNK ??)
///
/// @param p_arg The pointer to the Argument we're going to update.
/// @param[in] p_dev_kernel_args The pointer to the device memory which will contain kernel
/// arguments.
/// @param[in] p_host_kernel_args The pointer to the host memory which contains kernel
/// arguments that should be copied to device memory.
///
virtual
void
SetDeviceKernelArgs
(
BaseArgument
*
p_arg
,
void
*
p_dev_kernel_args
,
const
void
*
p_host_kernel_args
)
const
{
ignore
=
p_arg
;
ignore
=
p_dev_kernel_args
;
ignore
=
p_host_kernel_args
;
std
::
ostringstream
err
;
err
<<
"This function is not implemented by the kernel: "
<<
this
->
GetTypeString
()
<<
__FILE__
<<
":"
<<
__LINE__
<<
", in function: "
<<
__func__
;
throw
std
::
runtime_error
(
err
.
str
());
}
//----------------------------------------------------------------------------------------------
/// @brief Sets the device kernel arguments pointer and may copy data to device.
///
/// @param p_arg The pointer to the Argument we're going to update.
/// @param[in] p_dev_kernel_args The pointer to the device memory which contains kernel
/// arguments.
///
virtual
void
SetDeviceKernelArgs
(
BaseArgument
*
p_arg
,
void
*
p_dev_kernel_args
)
const
{
ignore
=
p_arg
;
ignore
=
p_dev_kernel_args
;
std
::
ostringstream
err
;
err
<<
"This function is not implemented by the kernel: "
<<
this
->
GetTypeString
()
<<
__FILE__
<<
":"
<<
__LINE__
<<
", in function: "
<<
__func__
;
throw
std
::
runtime_error
(
err
.
str
());
}
//----------------------------------------------------------------------------------------------
/// @brief Gets the device kernel argument size.
///
/// @param[in] p_arg The pointer to the Device op Argument.
///
/// @return The device kernel argument size.
///
virtual
size_t
GetDeviceKernelArgSize
(
const
BaseArgument
*
p_arg
)
const
{
ignore
=
p_arg
;
std
::
ostringstream
err
;
err
<<
"This function is not implemented by the kernel: "
<<
this
->
GetTypeString
()
<<
__FILE__
<<
":"
<<
__LINE__
<<
", in function: "
<<
__func__
;
throw
std
::
runtime_error
(
err
.
str
());
}
};
}
// namespace device
...
...
include/ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp
View file @
c8c016dd
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <array>
#include "device_grouped_gemm.hpp"
#include "device_grouped_gemm_splitk.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
template
<
index_t
NumDTensor
=
0
>
struct
GroupedGemmKernelArgument
{
const
void
*
p_a_grid
;
const
void
*
p_b_grid
;
std
::
array
<
const
void
*
,
NumDTensor
>
p_ds_grid
;
void
*
p_e_grid
;
index_t
M
;
index_t
N
;
index_t
K
;
index_t
StrideA
;
index_t
StrideB
;
std
::
array
<
index_t
,
NumDTensor
>
StrideDs
;
index_t
StrideE
;
};
template
<
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
...
...
@@ -41,7 +20,7 @@ template <typename ALayout,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CElementwiseOperation
>
struct
DeviceGroupedGemmFixedNK
:
DeviceGroupedGemm
<
ALayout
,
struct
DeviceGroupedGemmFixedNK
:
DeviceGroupedGemm
SplitK
<
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
...
...
@@ -53,9 +32,6 @@ struct DeviceGroupedGemmFixedNK : DeviceGroupedGemm<ALayout,
BElementwiseOperation
,
CElementwiseOperation
>
{
virtual
void
SetDeviceKernelArgs
(
BaseArgument
*
p_arg
,
const
void
*
kernel_args
)
const
=
0
;
virtual
size_t
GetDeviceKernelArgSize
(
const
BaseArgument
*
p_arg
)
const
=
0
;
virtual
void
SetKBatch
(
BaseArgument
*
p_arg
,
index_t
k_batch
)
const
=
0
;
};
}
// namespace device
...
...
include/ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp
deleted
100644 → 0
View file @
e8ca3daf
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <array>
#include <iostream>
#include <vector>
#include <sstream>
#include "device_grouped_gemm.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
///
/// @brief Structure representing single GEMM problem arguments.
///
/// The pointer to the vector of those structures is passed to the GroupedGEMM entry
/// point kernel.
///
/// @tparam NumDTensor The number of D input tensors.
///
template
<
index_t
NumDTensor
=
0
>
struct
GroupedGemmMultipleDKernelArguments
{
__host__
__device__
GroupedGemmMultipleDKernelArguments
(
const
void
*
p_a_grid_
,
const
void
*
p_b_grid_
,
std
::
array
<
const
void
*
,
NumDTensor
>
p_ds_grid_
,
void
*
p_e_grid_
,
index_t
M_
,
index_t
N_
,
index_t
K_
,
index_t
StrideA_
,
index_t
StrideB_
,
std
::
array
<
index_t
,
NumDTensor
>
StrideDs_
,
index_t
StrideE_
)
:
p_a_grid
{
p_a_grid_
},
p_b_grid
{
p_b_grid_
},
p_ds_grid
{
p_ds_grid_
},
p_e_grid
{
p_e_grid_
},
M
{
M_
},
N
{
N_
},
K
{
K_
},
StrideA
{
StrideA_
},
StrideB
{
StrideB_
},
StrideDs
{
StrideDs_
},
StrideE
{
StrideE_
}
{
}
const
void
*
p_a_grid
;
const
void
*
p_b_grid
;
std
::
array
<
const
void
*
,
NumDTensor
>
p_ds_grid
;
void
*
p_e_grid
;
index_t
M
;
index_t
N
;
index_t
K
;
index_t
StrideA
;
index_t
StrideB
;
std
::
array
<
index_t
,
NumDTensor
>
StrideDs
;
index_t
StrideE
;
void
Print
()
const
{
std
::
stringstream
str
;
for
(
auto
sd
:
StrideDs
)
str
<<
sd
<<
","
;
std
::
cout
<<
"arg {"
<<
"M:"
<<
M
<<
", "
<<
"N:"
<<
N
<<
", "
<<
"K:"
<<
K
<<
", "
<<
"SA:"
<<
StrideA
<<
", "
<<
"SB:"
<<
StrideB
<<
", "
<<
"SE:"
<<
StrideE
<<
", "
<<
"SDs: {"
<<
str
.
str
()
<<
"}"
<<
"}"
<<
std
::
endl
;
}
};
template
<
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
typename
ADataType
,
typename
BDataType
,
typename
DsDataType
,
typename
EDataType
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CDEElementwiseOperation
>
struct
DeviceGroupedGemmMultipleDSplitK
:
public
DeviceGroupedGemm
<
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
ADataType
,
BDataType
,
DsDataType
,
EDataType
,
AElementwiseOperation
,
BElementwiseOperation
,
CDEElementwiseOperation
>
{
//----------------------------------------------------------------------------------------------
/// @brief Sets the k batch size.
///
/// @param p_arg Pointer to the Argument we're going to change.
/// @param[in] kbatch The kbatch value.
///
virtual
void
SetKBatchSize
(
BaseArgument
*
p_arg
,
index_t
kbatch
)
const
=
0
;
//----------------------------------------------------------------------------------------------
/// @brief Sets the device kernel arguments pointer.
///
/// @param p_arg The pointer to the Argument we're going to update.
/// @param[in] p_dev_kernel_args The pointer to the device memory which contains kernel
/// arguments.
///
virtual
void
SetDeviceKernelArgs
(
BaseArgument
*
p_arg
,
void
*
p_dev_kernel_args
)
const
=
0
;
//----------------------------------------------------------------------------------------------
/// @brief Gets the device kernel argument size.
///
/// @param[in] p_arg The pointer to the Device op Argument.
///
/// @return The device kernel argument size.
///
virtual
size_t
GetDeviceKernelArgSize
(
const
BaseArgument
*
p_arg
)
const
=
0
;
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp
View file @
c8c016dd
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "device_grouped_gemm.hpp"
...
...
@@ -31,7 +31,23 @@ struct DeviceGroupedGemmSplitK : public DeviceGroupedGemm<ALayout,
BElementwiseOperation
,
CElementwiseOperation
>
{
//----------------------------------------------------------------------------------------------
/// @brief Sets the k batch size.
///
/// @param p_arg Pointer to the Argument we're going to change.
/// @param[in] kbatch The kbatch value.
///
virtual
void
SetKBatchSize
(
BaseArgument
*
p_arg
,
index_t
kbatch
)
const
=
0
;
//----------------------------------------------------------------------------------------------
/// @brief Sets the k batch size.
///
/// @param p_arg Pointer to the Argument we're going to change.
/// @param[in] kbatch The kbatch value.
///
virtual
void
SetKBatch
(
BaseArgument
*
p_arg
,
index_t
kbatch
)
const
{
this
->
SetKBatchSize
(
p_arg
,
kbatch
);
};
};
}
// namespace device
...
...
include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp
View file @
c8c016dd
...
...
@@ -3,83 +3,20 @@
#pragma once
#include <array>
#include <iostream>
#include <vector>
#include <sstream>
#include "device_grouped_gemm.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
/// @brief Grouped GEMM kernel using output Tile Looping algorithm
///
/// @brief Structure representing single GEMM problem arguments.
///
/// The pointer to the vector of those structures is passed to the GroupedGEMM entry
/// point kernel.
///
/// @tparam NumDTensor The number of D input tensors.
/// @par This kernel does not require any knowledge about input data sizes (GEMM M/N/K)
/// It requires only the number of groups to launch. Other information like
/// data pointers and GEMM sizes, packed into gemm kernel args may be all dynamic
/// (known only at kernel run-time).
///
template
<
index_t
NumDTensor
=
0
>
struct
GroupedGemmTileLoopKernelArguments
{
__host__
__device__
GroupedGemmTileLoopKernelArguments
(
const
void
*
p_a_grid_
,
const
void
*
p_b_grid_
,
std
::
array
<
const
void
*
,
NumDTensor
>
p_ds_grid_
,
void
*
p_e_grid_
,
index_t
M_
,
index_t
N_
,
index_t
K_
,
index_t
StrideA_
,
index_t
StrideB_
,
std
::
array
<
index_t
,
NumDTensor
>
StrideDs_
,
index_t
StrideE_
)
:
p_a_grid
{
p_a_grid_
},
p_b_grid
{
p_b_grid_
},
p_ds_grid
{
p_ds_grid_
},
p_e_grid
{
p_e_grid_
},
M
{
M_
},
N
{
N_
},
K
{
K_
},
StrideA
{
StrideA_
},
StrideB
{
StrideB_
},
StrideDs
{
StrideDs_
},
StrideE
{
StrideE_
}
{
}
const
void
*
p_a_grid
;
const
void
*
p_b_grid
;
std
::
array
<
const
void
*
,
NumDTensor
>
p_ds_grid
;
void
*
p_e_grid
;
index_t
M
;
index_t
N
;
index_t
K
;
index_t
StrideA
;
index_t
StrideB
;
std
::
array
<
index_t
,
NumDTensor
>
StrideDs
;
index_t
StrideE
;
void
Print
()
const
{
std
::
stringstream
str
;
for
(
auto
sd
:
StrideDs
)
str
<<
sd
<<
","
;
std
::
cout
<<
"arg {"
<<
"M:"
<<
M
<<
", "
<<
"N:"
<<
N
<<
", "
<<
"K:"
<<
K
<<
", "
<<
"SA:"
<<
StrideA
<<
", "
<<
"SB:"
<<
StrideB
<<
", "
<<
"SE:"
<<
StrideE
<<
", "
<<
"SDs: {"
<<
str
.
str
()
<<
"}"
<<
"}"
<<
std
::
endl
;
}
};
/// @note This kernel does not support SplitK.
template
<
typename
ALayout
,
typename
BLayout
,
...
...
@@ -104,23 +41,6 @@ struct DeviceGroupedGemmTileLoop : public DeviceGroupedGemm<ALayout,
BElementwiseOperation
,
CDEElementwiseOperation
>
{
//----------------------------------------------------------------------------------------------
/// @brief Sets the device kernel arguments pointer.
///
/// @param p_arg The pointer to the Argument we're going to update.
/// @param[in] p_dev_kernel_args The pointer to the device memory which contains kernel
/// arguments.
///
virtual
void
SetDeviceKernelArgs
(
BaseArgument
*
p_arg
,
void
*
p_dev_kernel_args
)
const
=
0
;
//----------------------------------------------------------------------------------------------
/// @brief Gets the device kernel argument size.
///
/// @param[in] p_arg The pointer to the Device op Argument.
///
/// @return The device kernel argument size.
///
virtual
size_t
GetDeviceKernelArgSize
(
const
BaseArgument
*
p_arg
)
const
=
0
;
};
}
// namespace device
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
100644 → 100755
View file @
c8c016dd
...
...
@@ -131,6 +131,7 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
{
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
if
(
stream_config
.
log_level_
>
0
)
{
arg
.
Print
();
...
...
@@ -147,26 +148,27 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
index_t
K_split
=
(
arg
.
K
+
k_grain
-
1
)
/
k_grain
*
KPerBlock
;
const
bool
has_main_k_block_loop
=
GridwiseGemm
::
CalculateHasMainKBlockLoop
(
K_split
);
hipGetErrorString
(
hipMemsetAsync
(
if
constexpr
(
GridwiseGemm
::
Block2CTileMap_streamk
::
ReductionStrategy
==
StreamKReductionStrategy
::
Atomic
)
{
hip_check_error
(
hipMemsetAsync
(
arg
.
p_c_grid
,
0
,
arg
.
M
*
arg
.
N
*
sizeof
(
CDataType
),
stream_config
.
stream_id_
));
}
const
auto
Run
=
[
&
](
const
auto
&
kernel
)
{
dim3
grid_dim
;
if
(
arg
.
Grid_size
<
0
)
{
int
occupancy
,
num_cu
;
hipError_t
rtn
;
rtn
=
hipOccupancyMaxActiveBlocksPerMultiprocessor
(
&
occupancy
,
kernel
,
BlockSize
,
0
);
hip_check_error
(
rtn
);
hip_check_error
(
hipOccupancyMaxActiveBlocksPerMultiprocessor
(
&
occupancy
,
kernel
,
BlockSize
,
0
));
hipDeviceProp_t
dev_prop
;
hipDevice_t
dev
;
rtn
=
hipGetDevice
(
&
dev
);
hip_check_error
(
rtn
);
rtn
=
hipGetDeviceProperties
(
&
dev_prop
,
dev
);
hip_check_error
(
rtn
);
hip_check_error
(
hipGetDevice
(
&
dev
));
hip_check_error
(
hipGetDeviceProperties
(
&
dev_prop
,
dev
));
num_cu
=
dev_prop
.
multiProcessorCount
;
arg
.
Grid_size
=
num_cu
*
occupancy
;
grid_dim
=
arg
.
Grid_size
;
}
...
...
@@ -196,9 +198,32 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
else
{
if
constexpr
(
GridwiseGemm
::
Block2CTileMap_streamk
::
ReductionStrategy
==
StreamKReductionStrategy
::
Atomic
)
{
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
grid_dim
,
dim3
(
BlockSize
),
0
,
arg
);
}
else
if
constexpr
(
GridwiseGemm
::
Block2CTileMap_streamk
::
ReductionStrategy
==
StreamKReductionStrategy
::
Reduction
)
{
char
*
workspace_semaphore
=
reinterpret_cast
<
char
*>
(
arg
.
p_workspace_
)
+
arg
.
block_2_ctile_map_streamk
.
get_workspace_size_for_acc
(
sizeof
(
GemmAccDataType
));
auto
preprocess
=
[
&
]()
{
hipMemsetAsync
(
workspace_semaphore
,
0
,
// sizeof(uint32_t),
arg
.
block_2_ctile_map_streamk
.
get_workspace_size_for_semaphore
(),
stream_config
.
stream_id_
);
};
ave_time
=
launch_and_time_kernel_with_preprocess
(
stream_config
,
preprocess
,
kernel
,
grid_dim
,
dim3
(
BlockSize
),
0
,
arg
);
}
}
};
constexpr
index_t
minimum_occupancy
=
...
...
@@ -211,15 +236,13 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v3
)
{
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
>
;
Run
(
kernel
);
}
}
// Tail number could be One to Seven
else
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v2
)
{
...
...
@@ -340,7 +363,6 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
else
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v4
)
{
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Odd
)
{
const
auto
kernel
=
...
...
@@ -362,11 +384,9 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
Run
(
kernel
);
}
}
}
else
{
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Odd
)
{
const
auto
kernel
=
...
...
@@ -389,23 +409,19 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
}
}
}
}
else
{
// Tail number always 1
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v1
)
{
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
false
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
>
;
Run
(
kernel
);
}
}
}
return
ave_time
;
}
...
...
@@ -418,6 +434,29 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
}
};
size_t
GetWorkSpaceSize
(
const
BaseArgument
*
pArg
)
const
override
{
const
Argument
*
p_arg
=
dynamic_cast
<
const
Argument
*>
(
pArg
);
if
constexpr
(
GridwiseGemm
::
Block2CTileMap_streamk
::
ReductionStrategy
==
StreamKReductionStrategy
::
Reduction
)
{
return
p_arg
->
block_2_ctile_map_streamk
.
get_workspace_size
(
sizeof
(
GemmAccDataType
));
}
else
{
return
0
;
}
}
void
SetWorkSpacePointer
(
BaseArgument
*
pArg
,
void
*
p_workspace
,
const
StreamConfig
&
=
StreamConfig
{})
const
override
{
Argument
*
pArg_
=
dynamic_cast
<
Argument
*>
(
pArg
);
pArg_
->
p_workspace_
=
p_workspace
;
}
static
constexpr
bool
IsValidCompilationParameter
()
{
// TODO: properly implement this check
...
...
@@ -464,8 +503,205 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
CElementwiseOperation
)
{
return
Argument
{
p_a
,
p_b
,
p_c
,
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
,
streamk_sel
,
Grid_size
};
// HS
constexpr
index_t
minimum_occupancy
=
BlkGemmPipeSched
==
BlockGemmPipelineScheduler
::
Intrawave
?
1
:
2
;
index_t
K_split
=
(
K
+
KPerBlock
-
1
)
/
KPerBlock
*
KPerBlock
;
const
bool
has_main_k_block_loop
=
GridwiseGemm
::
CalculateHasMainKBlockLoop
(
K_split
);
int
occupancy
,
num_cu
;
const
auto
calculate_grid_size
=
[
&
](
const
auto
&
kernel
)
{
hip_check_error
(
hipOccupancyMaxActiveBlocksPerMultiprocessor
(
&
occupancy
,
kernel
,
BlockSize
,
0
));
hipDeviceProp_t
dev_prop
;
hipDevice_t
dev
;
hip_check_error
(
hipGetDevice
(
&
dev
));
hip_check_error
(
hipGetDeviceProperties
(
&
dev_prop
,
dev
));
num_cu
=
dev_prop
.
multiProcessorCount
;
Grid_size
=
num_cu
*
occupancy
;
};
if
(
has_main_k_block_loop
)
{
// Tail number always full
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v1
||
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v3
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
>
;
calculate_grid_size
(
kernel
);
}
// Tail number could be One to Seven
else
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v2
)
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
One
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
,
TailNumber
::
One
>
;
calculate_grid_size
(
kernel
);
}
else
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Full
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
,
TailNumber
::
Full
>
;
calculate_grid_size
(
kernel
);
}
if
constexpr
(
GridwiseGemm
::
BlockwiseGemmPipe
::
PrefetchStages
>
2
)
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Two
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
,
TailNumber
::
Two
>
;
calculate_grid_size
(
kernel
);
}
}
if
constexpr
(
GridwiseGemm
::
BlockwiseGemmPipe
::
PrefetchStages
>
3
)
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Three
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
,
TailNumber
::
Three
>
;
calculate_grid_size
(
kernel
);
}
}
if
constexpr
(
GridwiseGemm
::
BlockwiseGemmPipe
::
PrefetchStages
>
4
)
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Four
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
,
TailNumber
::
Four
>
;
calculate_grid_size
(
kernel
);
}
}
if
constexpr
(
GridwiseGemm
::
BlockwiseGemmPipe
::
PrefetchStages
>
5
)
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Five
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
,
TailNumber
::
Five
>
;
calculate_grid_size
(
kernel
);
}
}
if
constexpr
(
GridwiseGemm
::
BlockwiseGemmPipe
::
PrefetchStages
>
6
)
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Six
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
,
TailNumber
::
Six
>
;
calculate_grid_size
(
kernel
);
}
}
if
constexpr
(
GridwiseGemm
::
BlockwiseGemmPipe
::
PrefetchStages
>
7
)
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Seven
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
,
TailNumber
::
Seven
>
;
calculate_grid_size
(
kernel
);
}
}
}
// Tail number could be Odd or Even
else
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v4
)
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Odd
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3_2lds
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
,
TailNumber
::
Odd
>
;
calculate_grid_size
(
kernel
);
}
else
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3_2lds
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
,
TailNumber
::
Even
>
;
calculate_grid_size
(
kernel
);
}
}
else
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Odd
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
,
TailNumber
::
Odd
>
;
calculate_grid_size
(
kernel
);
}
else
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
,
TailNumber
::
Even
>
;
calculate_grid_size
(
kernel
);
}
}
}
else
{
// Tail number always 1
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v1
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
false
,
InMemoryDataOperationEnum
::
Set
,
minimum_occupancy
>
;
calculate_grid_size
(
kernel
);
}
}
return
Argument
{
p_a
,
p_b
,
p_c
,
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
,
streamk_sel
,
Grid_size
};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
View file @
c8c016dd
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023
-2024
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -106,8 +106,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
index_t
KPerBlock
=
K0PerBlock
*
K1
;
static
constexpr
auto
transform_conv_to_gemm
=
TransformConvBwdDataToGemm_v1
<
NDimSpatial
,
using
ConvToGemmBwdDataTransform
=
TransformConvBwdDataToGemm_v1
<
NDimSpatial
,
ConvBackwardDataSpecialization
,
K1
,
K1
,
...
...
@@ -115,80 +114,27 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
NPerBlock
,
KPerBlock
,
true
/* DoPadGemmM */
,
true
/* DoPadGemmN */
>
{};
true
/* DoPadGemmN */
,
ALayout
,
BLayout
,
ELayout
>
;
static
auto
GetDummyABDsEGridDescriptor
()
static
auto
GetDummyABDsEGridDescriptor
(
const
ConvToGemmBwdDataTransform
&
conv_to_gemm_transform
)
{
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>
dummy_tensor_lengths
=
{
1
};
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>
dummy_tensor_strides
=
{
1
};
const
std
::
array
<
index_t
,
NDimSpatial
>
dummy_spatial_lengths
=
{
1
};
const
auto
a_grid_desc_ak0_m_ak1
=
transform_conv_to_gemm
.
template
MakeADescriptor_AK0_M_AK1
<
ALayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
const
auto
b_grid_desc_bk0_n_bk1
=
transform_conv_to_gemm
.
template
MakeBDescriptor_BK0_N_BK1
<
BLayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
const
auto
ds_grid_desc_m_n
=
generate_tuple
(
[
&
](
auto
i
)
{
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
return
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
DLayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
},
const
auto
a_grid_desc_ak0_m_ak1
=
conv_to_gemm_transform
.
MakeADescriptor_AK0_M_AK1
();
const
auto
b_grid_desc_bk0_n_bk1
=
conv_to_gemm_transform
.
MakeBDescriptor_BK0_N_BK1
();
const
auto
ds_grid_desc_m_n
=
generate_tuple
([
&
](
auto
)
{
return
conv_to_gemm_transform
.
MakeCDescriptor_M_N
();
},
Number
<
NumDTensor
>
{});
const
auto
e_grid_desc_m_n
=
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
ELayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
const
auto
e_grid_desc_m_n
=
conv_to_gemm_transform
.
MakeCDescriptor_M_N
();
return
make_tuple
(
a_grid_desc_ak0_m_ak1
,
b_grid_desc_bk0_n_bk1
,
ds_grid_desc_m_n
,
e_grid_desc_m_n
);
}
// desc
using
ABDsEGridDesc
=
decltype
(
GetDummyABDsEGridDescriptor
());
constexpr
static
ConvToGemmBwdDataTransform
dummy_conv_to_gemm_transform
;
using
ABDsEGridDesc
=
decltype
(
GetDummyABDsEGridDescriptor
(
dummy_conv_to_gemm_transform
));
using
AGridDesc_AK0_M_AK1
=
remove_cvref_t
<
tuple_element_t
<
0
,
ABDsEGridDesc
>>
;
using
BGridDesc_BK0_N_BK1
=
remove_cvref_t
<
tuple_element_t
<
1
,
ABDsEGridDesc
>>
;
...
...
@@ -270,7 +216,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_c_wis_lengths
,
/*
ds_g_n_c_wis_lengths
*/
,
const
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
e_g_n_c_wis_lengths
,
...
...
@@ -291,15 +237,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
b_element_op_
{
b_element_op
},
cde_element_op_
{
cde_element_op
},
a_g_n_k_wos_lengths_
{
a_g_n_k_wos_lengths
},
a_g_n_k_wos_strides_
{
a_g_n_k_wos_strides
},
b_g_k_c_xs_lengths_
{
b_g_k_c_xs_lengths
},
b_g_k_c_xs_strides_
{
b_g_k_c_xs_strides
},
ds_g_n_c_wis_lengths_
{
ds_g_n_c_wis_lengths
},
ds_g_n_c_wis_strides_
{
ds_g_n_c_wis_strides
},
e_g_n_c_wis_lengths_
{
e_g_n_c_wis_lengths
},
e_g_n_c_wis_strides_
{
e_g_n_c_wis_strides
},
conv_filter_strides_
{
conv_filter_strides
},
conv_filter_dilations_
{
conv_filter_dilations
},
input_left_pads_
{
input_left_pads
},
input_right_pads_
{
input_right_pads
}
{
...
...
@@ -382,9 +321,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
tildes
=
{
i_ztilde
,
i_ytilde
,
i_xtilde
};
}
const
auto
a_grid_desc_ak0_m_ak1
=
transform_conv_to_gemm
.
template
MakeADescriptor_AK0_M_AK1
<
ALayout
>(
a_g_n_k_wos_lengths
,
ConvToGemmBwdDataTransform
conv_to_gemm_transform_
{
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
...
...
@@ -394,56 +331,37 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
);
tildes
};
const
auto
a_grid_desc_ak0_m_ak1
=
conv_to_gemm_transform_
.
MakeADescriptor_AK0_M_AK1
();
const
auto
b_grid_desc_bk0_n_bk1
=
transform_conv_to_gemm
.
template
MakeBDescriptor_BK0_N_BK1
<
BLayout
>(
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
e_g_n_c_wis_lengths
,
e_g_n_c_wis_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
);
conv_to_gemm_transform_
.
MakeBDescriptor_BK0_N_BK1
();
DsGridDesc_M_N
ds_grid_desc_m_n
;
// populate Ds desc
static_for
<
0
,
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
ds_grid_desc_m_n
(
i
)
=
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
DLayout
>(
static_assert
(
is_same_v
<
DLayout
,
ELayout
>
);
ConvToGemmBwdDataTransform
conv_to_gemm_transform_d
{
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
ds
_g_n_c_wis_lengths
[
i
]
,
e
_g_n_c_wis_lengths
,
ds_g_n_c_wis_strides
[
i
],
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
);
tildes
};
ds_grid_desc_m_n
(
i
)
=
conv_to_gemm_transform_d
.
MakeCDescriptor_M_N
();
});
const
auto
e_grid_desc_m_n
=
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
ELayout
>(
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
e_g_n_c_wis_lengths
,
e_g_n_c_wis_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
);
const
auto
e_grid_desc_m_n
=
conv_to_gemm_transform_
.
MakeCDescriptor_M_N
();
// for check validity
ds_grid_desc_m_n_container_
.
push_back
(
ds_grid_desc_m_n
);
...
...
@@ -522,17 +440,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
BElementwiseOp
b_element_op_
;
CDEElementwiseOp
cde_element_op_
;
// for checking IsSupportedArgument()
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_k_wos_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_k_wos_strides_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides_
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_c_wis_lengths_
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_c_wis_strides_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_c_wis_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_c_wis_strides_
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_dilations_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_
;
};
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
View file @
c8c016dd
...
...
@@ -54,15 +54,16 @@ template <typename GridwiseGemm,
typename
ABDataType
,
typename
DsPointer
,
typename
EDataType
,
typename
AElementwiseOp
eration
,
typename
BElementwiseOp
eration
,
typename
CDEElementwiseOp
eration
,
typename
AElementwiseOp
,
typename
BElementwiseOp
,
typename
CDEElementwiseOp
,
typename
AGridDesc_AK0_M_AK1
,
typename
BGridDesc_BK0_N_BK1
,
typename
DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
,
typename
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
,
typename
Block2ETileMap
,
typename
ComputePtrOffsetOfBatch
,
typename
ComputePtrOffsetOfN
,
bool
HasMainKBlockLoop
>
__global__
void
#if CK_USE_LAUNCH_BOUNDS
...
...
@@ -73,10 +74,9 @@ __global__ void
const
ABDataType
*
__restrict__
p_b_grid
,
DsPointer
p_ds_grid
,
EDataType
*
__restrict__
p_e_grid
,
const
AElementwiseOperation
a_element_op
,
const
BElementwiseOperation
b_element_op
,
const
CDEElementwiseOperation
cde_element_op
,
const
index_t
batch_count
,
const
AElementwiseOp
a_element_op
,
const
BElementwiseOp
b_element_op
,
const
CDEElementwiseOp
cde_element_op
,
const
AGridDesc_AK0_M_AK1
a_grid_desc_ak0_m_ak1
,
const
BGridDesc_BK0_N_BK1
b_grid_desc_bk0_n_bk1
,
const
DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
...
...
@@ -84,24 +84,29 @@ __global__ void
const
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
e_grid_desc_mblock_mperblock_nblock_nperblock_
,
const
Block2ETileMap
block_2_ctile_map
,
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
)
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
,
const
ComputePtrOffsetOfN
compute_ptr_offset_of_n
)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
defined(__gfx94__))
// offset base pointer for each work-group
const
index_t
num_blocks_per_batch
=
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
const
index_t
g_idx
=
__builtin_amdgcn_readfirstlane
(
get_block_1d_id
()
/
num_blocks_per_batch
);
const
index_t
n_idx
=
__builtin_amdgcn_readfirstlane
(
blockIdx
.
z
);
const
index_t
g_idx
=
__builtin_amdgcn_readfirstlane
(
blockIdx
.
y
);
const
long_index_t
a_batch_offset
=
amd_wave_read_first_lane
(
static_cast
<
long_index_t
>
(
compute_ptr_offset_of_batch
.
GetAPtrOffset
(
g_idx
))
)
;
const
long_index_t
b_batch_offset
=
amd_wave_read_first_lane
(
static_cast
<
long_index_t
>
(
compute_ptr_offset_of_batch
.
GetBPtrOffset
(
g_idx
))
)
;
const
long_index_t
e_batch_offset
=
amd_wave_read_first_lane
(
static_cast
<
long_index_t
>
(
compute_ptr_offset_of_batch
.
GetEPtrOffset
(
g_idx
))
)
;
const
long_index_t
a_batch_offset
=
amd_wave_read_first_lane
(
compute_ptr_offset_of_batch
.
GetAPtrOffset
(
g_idx
));
const
long_index_t
b_batch_offset
=
amd_wave_read_first_lane
(
compute_ptr_offset_of_batch
.
GetBPtrOffset
(
g_idx
));
const
long_index_t
e_batch_offset
=
amd_wave_read_first_lane
(
compute_ptr_offset_of_batch
.
GetEPtrOffset
(
g_idx
));
const
auto
ds_batch_offset
=
compute_ptr_offset_of_batch
.
GetDsPtrOffset
(
g_idx
);
const
long_index_t
a_n_offset
=
amd_wave_read_first_lane
(
compute_ptr_offset_of_n
.
GetAPtrOffset
(
n_idx
));
const
long_index_t
e_n_offset
=
amd_wave_read_first_lane
(
compute_ptr_offset_of_n
.
GetEPtrOffset
(
n_idx
));
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
DsPointer
p_ds_grid_grp
;
...
...
@@ -112,10 +117,10 @@ __global__ void
static_for
<
0
,
NumDTensor
,
1
>
{}(
[
&
](
auto
i
)
{
p_ds_grid_grp
(
i
)
=
p_ds_grid
[
i
]
+
ds_batch_offset
[
i
];
});
GridwiseGemm
::
template
Run
<
HasMainKBlockLoop
>(
p_a_grid
+
a_batch_offset
,
GridwiseGemm
::
template
Run
<
HasMainKBlockLoop
>(
p_a_grid
+
a_batch_offset
+
a_n_offset
,
p_b_grid
+
b_batch_offset
,
p_ds_grid_grp
,
p_e_grid
+
e_batch_offset
,
p_e_grid
+
e_batch_offset
+
e_n_offset
,
p_shared
,
a_element_op
,
b_element_op
,
...
...
@@ -130,7 +135,6 @@ __global__ void
ignore
=
p_b_grid
;
ignore
=
p_ds_grid
;
ignore
=
p_e_grid
;
ignore
=
batch_count
;
ignore
=
a_grid_desc_ak0_m_ak1
;
ignore
=
b_grid_desc_bk0_n_bk1
;
ignore
=
ds_grid_desc_mblock_mperblock_nblock_nperblock
;
...
...
@@ -139,6 +143,7 @@ __global__ void
ignore
=
b_element_op
;
ignore
=
cde_element_op
;
ignore
=
compute_ptr_offset_of_batch
;
ignore
=
compute_ptr_offset_of_n
;
ignore
=
block_2_ctile_map
;
#endif
}
...
...
@@ -233,8 +238,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
transform_conv_to_gemm
=
TransformConvBwdDataToGemm_v1
<
NDimSpatial
,
using
ConvToGemmBwdDataTransform
=
TransformConvBwdDataToGemm_v1
<
NDimSpatial
,
ConvBackwardDataSpecialization
,
AK1
,
BK1
,
...
...
@@ -242,73 +246,46 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
NPerBlock
,
KPerBlock
,
DoPadGemmM
,
DoPadGemmN
>
{};
DoPadGemmN
,
ALayout
,
BLayout
,
ELayout
,
true
,
/*SplitConvN*/
ABDataType
,
EDataType
>
;
static
auto
GetDummyABDsEGridDescriptor
()
static
auto
GetDummyABDsEGridDescriptor
(
const
ConvToGemmBwdDataTransform
&
conv_to_gemm_transform
)
{
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>
dummy_tensor_lengths
=
{
1
};
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>
dummy_tensor_strides
=
{
1
};
const
std
::
array
<
index_t
,
NDimSpatial
>
dummy_spatial_lengths
=
{
1
};
const
auto
a_grid_desc_ak0_m_ak1
=
conv_to_gemm_transform
.
MakeADescriptor_AK0_M_AK1
();
const
auto
a_grid_desc_ak0_m_ak1
=
transform_conv_to_gemm
.
template
MakeADescriptor_AK0_M_AK1
<
ALayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
const
auto
b_grid_desc_bk0_n_bk1
=
transform_conv_to_gemm
.
template
MakeBDescriptor_BK0_N_BK1
<
BLayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
const
auto
b_grid_desc_bk0_n_bk1
=
conv_to_gemm_transform
.
MakeBDescriptor_BK0_N_BK1
();
const
auto
ds_grid_desc_m_n
=
generate_tuple
(
[
&
](
auto
i
)
{
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
return
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
DLayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
using
DDataType
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsDataType
>>
;
using
ConvToGemmBwdDataTransformD
=
TransformConvBwdDataToGemm_v1
<
NDimSpatial
,
ConvBackwardDataSpecialization
,
AK1
,
BK1
,
MPerBlock
,
NPerBlock
,
KPerBlock
,
DoPadGemmM
,
DoPadGemmN
,
ALayout
,
BLayout
,
DLayout
,
true
,
/*SplitConvN*/
ABDataType
,
DDataType
>
;
return
ConvToGemmBwdDataTransformD
{}.
MakeCDescriptor_M_N
();
},
Number
<
NumDTensor
>
{});
const
auto
e_grid_desc_m_n
=
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
ELayout
>(
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_tensor_lengths
,
dummy_tensor_strides
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
,
dummy_spatial_lengths
);
const
auto
e_grid_desc_m_n
=
conv_to_gemm_transform
.
MakeCDescriptor_M_N
();
return
make_tuple
(
a_grid_desc_ak0_m_ak1
,
b_grid_desc_bk0_n_bk1
,
ds_grid_desc_m_n
,
e_grid_desc_m_n
);
...
...
@@ -377,7 +354,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
}
// desc
using
ABDsEGridDesc
=
decltype
(
GetDummyABDsEGridDescriptor
());
constexpr
static
ConvToGemmBwdDataTransform
dummy_conv_to_gemm_transform
;
using
ABDsEGridDesc
=
decltype
(
GetDummyABDsEGridDescriptor
(
dummy_conv_to_gemm_transform
));
using
AGridDesc_AK0_M_AK1
=
remove_cvref_t
<
tuple_element_t
<
0
,
ABDsEGridDesc
>>
;
using
BGridDesc_BK0_N_BK1
=
remove_cvref_t
<
tuple_element_t
<
1
,
ABDsEGridDesc
>>
;
...
...
@@ -431,15 +409,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
b_element_op_
{
b_element_op
},
cde_element_op_
{
cde_element_op
},
a_g_n_k_wos_lengths_
{
a_g_n_k_wos_lengths
},
a_g_n_k_wos_strides_
{
a_g_n_k_wos_strides
},
b_g_k_c_xs_lengths_
{
b_g_k_c_xs_lengths
},
b_g_k_c_xs_strides_
{
b_g_k_c_xs_strides
},
ds_g_n_c_wis_lengths_
{
ds_g_n_c_wis_lengths
},
ds_g_n_c_wis_strides_
{
ds_g_n_c_wis_strides
},
e_g_n_c_wis_lengths_
{
e_g_n_c_wis_lengths
},
e_g_n_c_wis_strides_
{
e_g_n_c_wis_strides
},
conv_filter_strides_
{
conv_filter_strides
},
conv_filter_dilations_
{
conv_filter_dilations
},
input_left_pads_
{
input_left_pads
},
input_right_pads_
{
input_right_pads
}
{
...
...
@@ -450,11 +421,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
p_ds_grid_
(
i
)
=
static_cast
<
const
DDataType
*>
(
p_ds
[
i
]);
});
// A/B/Ds/E Batch Stride
compute_ptr_offset_of_batch_
.
BatchStrideA_
=
a_g_n_k_wos_strides
[
0
];
compute_ptr_offset_of_batch_
.
BatchStrideB_
=
b_g_k_c_xs_strides
[
0
];
compute_ptr_offset_of_batch_
.
BatchStrideE_
=
e_g_n_c_wis_strides
[
0
];
static_for
<
0
,
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
compute_ptr_offset_of_batch_
.
BatchStrideDs_
(
i
)
=
ds_g_n_c_wis_strides
[
i
][
0
];
});
...
...
@@ -526,9 +492,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
throw
std
::
runtime_error
(
"wrong! only implemented for 2D and 3D now"
);
}
const
auto
a_grid_desc_ak0_m_ak1
=
transform_conv_to_gemm
.
template
MakeADescriptor_AK0_M_AK1
<
ALayout
>(
a_g_n_k_wos_lengths
,
ConvToGemmBwdDataTransform
conv_to_gemm_transform_
{
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
...
...
@@ -538,30 +502,39 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
);
tildes
};
conv_N_per_block_
=
conv_to_gemm_transform_
.
N_
;
const
auto
a_grid_desc_ak0_m_ak1
=
conv_to_gemm_transform_
.
MakeADescriptor_AK0_M_AK1
();
const
auto
b_grid_desc_bk0_n_bk1
=
transform_conv_to_gemm
.
template
MakeBDescriptor_BK0_N_BK1
<
BLayout
>(
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
e_g_n_c_wis_lengths
,
e_g_n_c_wis_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
);
conv_to_gemm_transform_
.
MakeBDescriptor_BK0_N_BK1
();
DsGridDesc_M_N
ds_grid_desc_m_n
;
// populate Ds desc
static_for
<
0
,
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
ds_grid_desc_m_n
(
i
)
=
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
DLayout
>(
using
DDataType
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsDataType
>>
;
using
ConvToGemmBwdDataTransformD
=
TransformConvBwdDataToGemm_v1
<
NDimSpatial
,
ConvBackwardDataSpecialization
,
AK1
,
BK1
,
MPerBlock
,
NPerBlock
,
KPerBlock
,
DoPadGemmM
,
DoPadGemmN
,
ALayout
,
BLayout
,
DLayout
,
true
,
/*SplitConvN*/
ABDataType
,
DDataType
>
;
ConvToGemmBwdDataTransformD
conv_to_gemm_transform_d
{
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
...
...
@@ -572,22 +545,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
);
tildes
};
ds_grid_desc_m_n
(
i
)
=
conv_to_gemm_transform_d
.
MakeCDescriptor_M_N
();
});
const
auto
e_grid_desc_m_n
=
transform_conv_to_gemm
.
template
MakeCDescriptor_M_N
<
ELayout
>(
a_g_n_k_wos_lengths
,
a_g_n_k_wos_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
e_g_n_c_wis_lengths
,
e_g_n_c_wis_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
tildes
);
const
auto
e_grid_desc_m_n
=
conv_to_gemm_transform_
.
MakeCDescriptor_M_N
();
// desc for problem definition
const
auto
a_grid_desc_m_k
=
...
...
@@ -628,6 +591,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
}
}
}
// A/B/Ds/E Batch Stride
compute_ptr_offset_of_batch_
.
BatchStrideA_
=
a_g_n_k_wos_strides
[
0
];
compute_ptr_offset_of_batch_
.
BatchStrideB_
=
b_g_k_c_xs_strides
[
0
];
compute_ptr_offset_of_batch_
.
BatchStrideE_
=
e_g_n_c_wis_strides
[
0
];
compute_ptr_offset_of_n_
.
BatchStrideA_
=
a_g_n_k_wos_strides
[
1
]
*
conv_N_per_block_
;
compute_ptr_offset_of_n_
.
BatchStrideE_
=
e_g_n_c_wis_strides
[
1
]
*
conv_N_per_block_
;
}
void
Print
()
const
...
...
@@ -660,6 +630,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
// tensor descriptor for problem definition
index_t
num_group_
;
index_t
conv_N_per_block_
;
std
::
vector
<
AGridDesc_M_K
>
a_grid_desc_m_k_container_
;
std
::
vector
<
BGridDesc_N_K
>
b_grid_desc_n_k_container_
;
std
::
vector
<
DsGridDesc_M_N
>
ds_grid_desc_m_n_container_
;
...
...
@@ -678,23 +649,16 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
// for computing batch offset
ComputePtrOffsetOfStridedBatch
<
I1
,
I1
,
NumDTensor
>
compute_ptr_offset_of_batch_
;
ComputePtrOffsetOfStridedBatch
<
I1
,
I1
,
I0
>
compute_ptr_offset_of_n_
;
// element-wise op
AElementwiseOp
a_element_op_
;
BElementwiseOp
b_element_op_
;
CDEElementwiseOp
cde_element_op_
;
// for checking IsSupportedArgument()
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_k_wos_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_k_wos_strides_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides_
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_c_wis_lengths_
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_c_wis_strides_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_c_wis_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_c_wis_strides_
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_dilations_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_
;
};
...
...
@@ -711,8 +675,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
arg
.
Print
();
}
float
ave_time
=
0
;
const
index_t
gdy
=
arg
.
num_group_
;
const
index_t
num_workgroups_per_Conv_N
=
arg
.
a_g_n_k_wos_lengths_
[
I1
]
/
arg
.
conv_N_per_block_
;
const
index_t
gdz
=
num_workgroups_per_Conv_N
;
float
ave_time
=
0
;
for
(
std
::
size_t
i
=
0
;
i
<
arg
.
a_grid_desc_ak0_m_ak1_container_
.
size
();
i
++
)
{
if
(
!
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_m_k_container_
[
i
],
...
...
@@ -724,9 +692,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
throw
std
::
runtime_error
(
"wrong! device_op has invalid setting"
);
}
const
index_t
grid_size
=
arg
.
block_2_etile_map_container_
[
i
].
CalculateGridSize
(
arg
.
e_grid_desc_m_n_container_
[
i
])
*
arg
.
num_group_
;
const
index_t
gdx
=
arg
.
block_2_etile_map_container_
[
i
].
CalculateGridSize
(
arg
.
e_grid_desc_m_n_container_
[
i
]);
const
auto
GemmK
=
arg
.
a_grid_desc_m_k_container_
[
i
].
GetLength
(
I1
);
...
...
@@ -747,12 +714,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
DeviceOp
::
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
,
Block2ETileMap
,
ComputePtrOffsetOfStridedBatch
<
I1
,
I1
,
NumDTensor
>
,
ComputePtrOffsetOfStridedBatch
<
I1
,
I1
,
I0
>
,
has_main_loop
>
;
return
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
g
rid_size
),
dim3
(
g
dx
,
gdy
,
gdz
),
dim3
(
BlockSize
),
0
,
arg
.
p_a_grid_
,
...
...
@@ -762,13 +730,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
arg
.
a_element_op_
,
arg
.
b_element_op_
,
arg
.
cde_element_op_
,
arg
.
a_g_n_k_wos_lengths_
[
0
],
// Group count
arg
.
a_grid_desc_ak0_m_ak1_container_
[
i
],
arg
.
b_grid_desc_bk0_n_bk1_container_
[
i
],
arg
.
ds_grid_desc_mblock_mperblock_nblock_nperblock_container_
[
i
],
arg
.
e_grid_desc_mblock_mperblock_nblock_nperblock_container_
[
i
],
arg
.
block_2_etile_map_container_
[
i
],
arg
.
compute_ptr_offset_of_batch_
);
arg
.
compute_ptr_offset_of_batch_
,
arg
.
compute_ptr_offset_of_n_
);
};
if
(
GridwiseGemm
::
CalculateHasMainKBlockLoop
(
GemmK
))
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
View file @
c8c016dd
...
...
@@ -111,8 +111,7 @@ __global__ void
[[
maybe_unused
]]
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
,
[[
maybe_unused
]]
const
index_t
num_k_per_block
)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
// offset base pointer for each work-group
const
index_t
g_idx
=
__builtin_amdgcn_readfirstlane
(
blockIdx
.
z
*
NumGroupsToMerge
);
const
index_t
k_idx
=
__builtin_amdgcn_readfirstlane
(
blockIdx
.
y
*
num_k_per_block
);
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
View file @
c8c016dd
#pragma once
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -603,7 +603,7 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
}
hipGetErrorString
(
hipMemcpy
WithStream
(
arg
.
p_workspace_
,
hipMemcpy
Async
(
arg
.
p_workspace_
,
arg
.
gemm_desc_kernel_arg_
.
data
(),
arg
.
gemm_desc_kernel_arg_
.
size
()
*
sizeof
(
GemmKernelArg
),
hipMemcpyHostToDevice
,
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
View file @
c8c016dd
...
...
@@ -18,7 +18,6 @@
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
...
...
@@ -78,7 +77,7 @@ template <typename ALayout,
// TODO: change gridwise_gemm_v2r4r2 to support AK1 & BK1
enable_if_t
<
AK1
==
BK1
,
bool
>
=
false
>
struct
DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
:
public
DeviceGroupedGemm
MultipleD
SplitK
<
ALayout
,
:
public
DeviceGroupedGemmSplitK
<
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
...
...
@@ -530,7 +529,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
index_t
skipped_group_count_
;
index_t
grid_size_
;
// Pointer to device memory with GEMM kernel arguments.
const
void
*
p_dev_gemm_args_
;
void
*
p_dev_gemm_
k
args_
;
AElementwiseOperation
a_element_op_
;
BElementwiseOperation
b_element_op_
;
...
...
@@ -566,7 +565,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
/// @return The average kernel execution time (if time measurement is enabled.)
///
float
Run
(
const
Argument
&
arg
,
const
void
*
dev_gemm_args
,
void
*
dev_gemm_args
,
void
*
dev_gemm_workspace
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
...
...
@@ -621,7 +620,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
///
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
if
(
arg
.
p_dev_gemm_args_
==
nullptr
)
if
(
arg
.
p_dev_gemm_
k
args_
==
nullptr
)
{
std
::
ostringstream
err
;
err
<<
"The gemm arguments device buffer is not allocated!"
...
...
@@ -637,7 +636,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
throw
std
::
runtime_error
(
err
.
str
());
}
return
Run
(
arg
,
arg
.
p_dev_gemm_args_
,
arg
.
p_workspace_
,
stream_config
);
return
Run
(
arg
,
arg
.
p_dev_gemm_
k
args_
,
arg
.
p_workspace_
,
stream_config
);
}
float
Run
(
const
BaseArgument
*
p_arg
,
...
...
@@ -723,7 +722,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
template
<
bool
HasMainKBlockLoop
>
float
DispatchKernel
(
const
Argument
&
arg
,
const
void
*
dev_gemm_args
,
void
*
dev_gemm_
k
args
,
void
*
dev_gemm_workspace
,
const
StreamConfig
&
stream_config
)
const
{
...
...
@@ -746,7 +745,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
return
LaunchKernel
(
gemm_kernel
,
elementwise_kernel
,
arg
,
dev_gemm_args
,
dev_gemm_
k
args
,
dev_gemm_workspace
,
stream_config
);
}
...
...
@@ -755,12 +754,19 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
float
LaunchKernel
(
const
KernelFunction
&
gemm_kernel
,
const
KernelFunction2
&
elementwise_kernel
,
const
Argument
&
arg
,
const
void
*
dev_gemm_args
,
void
*
dev_gemm_
k
args
,
[[
maybe_unused
]]
void
*
dev_gemm_workspace
,
const
StreamConfig
&
stream_config
)
const
{
float
time
{
0.
f
};
hip_check_error
(
hipMemcpyAsync
(
dev_gemm_kargs
,
arg
.
gemm_kernel_args_
.
data
(),
arg
.
gemm_kernel_args_
.
size
()
*
sizeof
(
GemmTransKernelArg
),
hipMemcpyHostToDevice
,
stream_config
.
stream_id_
));
auto
preprocess
=
[
&
]()
{
hip_check_error
(
hipMemsetAsync
(
dev_gemm_workspace
,
0
,
arg
.
GetWorkspaceSizeBytes
(),
stream_config
.
stream_id_
));
...
...
@@ -774,7 +780,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
dim3
(
arg
.
grid_size_
),
dim3
(
BlockSize
),
0
,
cast_pointer_to_constant_address_space
(
dev_gemm_args
),
cast_pointer_to_constant_address_space
(
dev_gemm_
k
args
),
arg
.
gemm_kernel_args_
.
size
(),
arg
.
a_element_op_
,
arg
.
b_element_op_
,
...
...
@@ -930,18 +936,30 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
return
str
.
str
();
}
void
SetDeviceKernelArgs
(
Argument
&
arg
,
void
*
p_dev_kernel_args
)
const
void
SetDeviceKernelArgs
(
Base
Argument
*
p_
arg
,
void
*
p_dev_kernel_args
)
const
override
{
arg
.
p_dev_gemm_args_
=
p_dev_kernel_args
;
hip_check_error
(
hipMemcpy
(
p_dev_kernel_args
,
arg
.
gemm_kernel_args_
.
data
(),
GetDeviceKernelArgSize
(
&
arg
),
hipMemcpyHostToDevice
));
auto
arg_ptr
=
dynamic_cast
<
Argument
*>
(
p_arg
);
if
(
arg_ptr
)
{
arg_ptr
->
p_dev_gemm_kargs_
=
p_dev_kernel_args
;
}
else
throw
std
::
runtime_error
(
"The argument pointer is not an object of "
"DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!"
);
}
void
S
etDeviceKernelArg
s
(
BaseArgument
*
p_arg
,
void
*
p_dev_kernel_args
)
const
override
size_t
G
etDeviceKernelArg
Size
(
const
BaseArgument
*
p_arg
)
const
override
{
return
SetDeviceKernelArgs
(
*
dynamic_cast
<
Argument
*>
(
p_arg
),
p_dev_kernel_args
);
auto
arg
=
dynamic_cast
<
const
Argument
*>
(
p_arg
);
if
(
arg
)
{
return
arg
->
gemm_kernel_args_
.
size
()
*
sizeof
(
GemmTransKernelArg
);
}
else
throw
std
::
runtime_error
(
"The argument pointer is not an object of "
"DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!"
);
}
size_t
GetWorkSpaceSize
(
const
BaseArgument
*
p_arg
)
const
override
...
...
@@ -974,17 +992,22 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
"DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!"
);
}
static
void
SetKBatchSize
(
Argument
&
arg
,
index_t
kbatch
)
{
arg
.
UpdateKBatch
(
kbatch
);
}
void
SetKBatchSize
(
BaseArgument
*
p_arg
,
index_t
kbatch
)
const
override
[[
deprecated
]]
static
void
SetKBatchSize
(
Argument
&
arg
,
index_t
kbatch
)
{
return
SetKBatchSize
(
*
dynamic_cast
<
Argument
*>
(
p_arg
),
kbatch
);
arg
.
UpdateKBatch
(
kbatch
);
}
size_t
GetDeviceKernelArgSize
(
const
BaseArgument
*
p_arg
)
const
override
void
SetKBatchSize
(
BaseArgument
*
p_arg
,
index_t
kbatch
)
const
override
{
return
dynamic_cast
<
const
Argument
*>
(
p_arg
)
->
gemm_kernel_args_
.
size
()
*
sizeof
(
GemmTransKernelArg
);
auto
p_arg_
=
dynamic_cast
<
Argument
*>
(
p_arg
);
if
(
p_arg_
)
{
p_arg_
->
UpdateKBatch
(
kbatch
);
}
else
throw
std
::
runtime_error
(
"The argument pointer is not an object of "
"DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage::Argument structure!"
);
}
};
...
...
Prev
1
…
4
5
6
7
8
9
10
11
12
…
20
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment