Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
8df7bd01
Commit
8df7bd01
authored
Mar 16, 2022
by
Jing Zhang
Browse files
fixed comments: replace push_back with emplace_back to avoid copy constructor
parent
f4f94f70
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
29 additions
and
37 deletions
+29
-37
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+3
-3
library/include/ck/library/host_tensor/device.hpp
library/include/ck/library/host_tensor/device.hpp
+0
-1
library/src/host_tensor/device.cpp
library/src/host_tensor/device.cpp
+0
-6
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
...device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
+6
-6
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
...device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
+6
-6
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
...device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
+11
-12
profiler/include/profile_grouped_gemm_impl.hpp
profiler/include/profile_grouped_gemm_impl.hpp
+3
-3
No files found.
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
View file @
8df7bd01
...
...
@@ -168,11 +168,11 @@ int main(int argc, char* argv[])
for
(
int
i
=
0
;
i
<
gemm_shapes
.
size
();
i
++
)
{
a_tensors_device
.
push
_back
(
a_tensors_device
.
emplace
_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
ADataType
)
*
a_tensors
[
i
].
mDesc
.
GetElementSize
()));
b_tensors_device
.
push
_back
(
b_tensors_device
.
emplace
_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
BDataType
)
*
b_tensors
[
i
].
mDesc
.
GetElementSize
()));
c_tensors_device
.
push
_back
(
std
::
make_unique
<
DeviceMem
>
(
c_tensors_device
.
emplace
_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
CDataType
)
*
c_device_tensors
[
i
].
mDesc
.
GetElementSize
()));
a_tensors_device
[
i
]
->
ToDevice
(
a_tensors
[
i
].
mData
.
data
());
...
...
library/include/ck/library/host_tensor/device.hpp
View file @
8df7bd01
...
...
@@ -12,7 +12,6 @@ struct DeviceMem
{
DeviceMem
()
=
delete
;
DeviceMem
(
std
::
size_t
mem_size
);
DeviceMem
(
const
DeviceMem
&
p
);
void
*
GetDeviceBuffer
();
void
ToDevice
(
const
void
*
p
);
void
FromDevice
(
void
*
p
);
...
...
library/src/host_tensor/device.cpp
View file @
8df7bd01
...
...
@@ -5,12 +5,6 @@ DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
hipGetErrorString
(
hipMalloc
(
static_cast
<
void
**>
(
&
mpDeviceBuf
),
mMemSize
));
}
DeviceMem
::
DeviceMem
(
const
DeviceMem
&
p
)
:
mpDeviceBuf
(
p
.
mpDeviceBuf
),
mMemSize
(
p
.
mMemSize
)
{
// hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
// hipGetErrorString(hipMemcpy(mpDeviceBuf, p.mpDeviceBuf, mMemSize, hipMemcpyDeviceToDevice));
}
void
*
DeviceMem
::
GetDeviceBuffer
()
{
return
mpDeviceBuf
;
}
void
DeviceMem
::
ToDevice
(
const
void
*
p
)
...
...
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
View file @
8df7bd01
...
...
@@ -23,8 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization_t
::
Default
;
// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
using
device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances
=
std
::
tuple
<
using
device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances
=
std
::
tuple
<
// clang-format off
//#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
//#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
...
...
@@ -44,7 +43,8 @@ using device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances =
void
add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances
(
std
::
vector
<
DeviceGroupedGemmPtr
<
PassThrough
,
PassThrough
,
PassThrough
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances
{});
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances
{});
}
}
// namespace device_grouped_gemm_instance
...
...
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
View file @
8df7bd01
...
...
@@ -23,8 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization_t
::
Default
;
// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
using
device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances
=
std
::
tuple
<
using
device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances
=
std
::
tuple
<
// clang-format off
//#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
//#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
...
...
@@ -44,7 +43,8 @@ using device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances =
void
add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances
(
std
::
vector
<
DeviceGroupedGemmPtr
<
PassThrough
,
PassThrough
,
PassThrough
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances
{});
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances
{});
}
}
// namespace device_grouped_gemm_instance
...
...
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
View file @
8df7bd01
...
...
@@ -24,8 +24,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpeciali
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization_t
::
MNPadding
;
// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
using
device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances
=
std
::
tuple
<
using
device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances
=
std
::
tuple
<
// clang-format off
//##################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
//##################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
...
...
@@ -48,8 +47,7 @@ using device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances =
>
;
// irregular tile size
using
device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances
=
std
::
tuple
<
using
device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances
=
std
::
tuple
<
// clang-format off
//##################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
//##################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
...
...
@@ -63,9 +61,10 @@ using device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances =
void
add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances
(
std
::
vector
<
DeviceGroupedGemmPtr
<
PassThrough
,
PassThrough
,
PassThrough
>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances
{});
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances
{});
device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances
{});
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances
{});
}
}
// namespace device_grouped_gemm_instance
...
...
profiler/include/profile_grouped_gemm_impl.hpp
View file @
8df7bd01
...
...
@@ -144,12 +144,12 @@ void profile_grouped_gemm_impl(int do_verification,
for
(
int
i
=
0
;
i
<
group_count
;
i
++
)
{
a_device_buf
.
push
_back
(
a_device_buf
.
emplace
_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
ADataType
)
*
a_m_k
[
i
].
mDesc
.
GetElementSize
()));
b_device_buf
.
push
_back
(
b_device_buf
.
emplace
_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
BDataType
)
*
b_k_n
[
i
].
mDesc
.
GetElementSize
()));
c_device_buf
.
push
_back
(
std
::
make_unique
<
DeviceMem
>
(
c_device_buf
.
emplace
_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
CDataType
)
*
c_m_n_device_results
[
i
].
mDesc
.
GetElementSize
()));
a_device_buf
[
i
]
->
ToDevice
(
a_m_k
[
i
].
mData
.
data
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment