Commit 8df7bd01 authored by Jing Zhang's avatar Jing Zhang
Browse files

fixed comments: replace push_back with emplace_back to avoid copy constructor

parent f4f94f70
...@@ -168,11 +168,11 @@ int main(int argc, char* argv[]) ...@@ -168,11 +168,11 @@ int main(int argc, char* argv[])
for(int i = 0; i < gemm_shapes.size(); i++) for(int i = 0; i < gemm_shapes.size(); i++)
{ {
a_tensors_device.push_back( a_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize())); std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize()));
b_tensors_device.push_back( b_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize())); std::make_unique<DeviceMem>(sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize()));
c_tensors_device.push_back(std::make_unique<DeviceMem>( c_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(CDataType) * c_device_tensors[i].mDesc.GetElementSize())); sizeof(CDataType) * c_device_tensors[i].mDesc.GetElementSize()));
a_tensors_device[i]->ToDevice(a_tensors[i].mData.data()); a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
......
...@@ -12,7 +12,6 @@ struct DeviceMem ...@@ -12,7 +12,6 @@ struct DeviceMem
{ {
DeviceMem() = delete; DeviceMem() = delete;
DeviceMem(std::size_t mem_size); DeviceMem(std::size_t mem_size);
DeviceMem(const DeviceMem& p);
void* GetDeviceBuffer(); void* GetDeviceBuffer();
void ToDevice(const void* p); void ToDevice(const void* p);
void FromDevice(void* p); void FromDevice(void* p);
......
...@@ -5,12 +5,6 @@ DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) ...@@ -5,12 +5,6 @@ DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize)); hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
} }
DeviceMem::DeviceMem(const DeviceMem& p) : mpDeviceBuf(p.mpDeviceBuf), mMemSize(p.mMemSize)
{
// hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
// hipGetErrorString(hipMemcpy(mpDeviceBuf, p.mpDeviceBuf, mMemSize, hipMemcpyDeviceToDevice));
}
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; } void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p) void DeviceMem::ToDevice(const void* p)
......
...@@ -23,8 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; ...@@ -23,8 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
// Compilation parameters for a[k, m] * b[k, n] = c[m, n] // Compilation parameters for a[k, m] * b[k, n] = c[m, n]
using device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances = using device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances = std::tuple<
std::tuple<
// clang-format off // clang-format off
//#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
//#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
...@@ -44,7 +43,8 @@ using device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances = ...@@ -44,7 +43,8 @@ using device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances =
void add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances( void add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(
std::vector<DeviceGroupedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances) std::vector<DeviceGroupedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
{ {
add_device_operation_instances(instances, device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances{}); add_device_operation_instances(instances,
device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances{});
} }
} // namespace device_grouped_gemm_instance } // namespace device_grouped_gemm_instance
......
...@@ -23,8 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; ...@@ -23,8 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
// Compilation parameters for a[k, m] * b[n, k] = c[m, n] // Compilation parameters for a[k, m] * b[n, k] = c[m, n]
using device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances = using device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances = std::tuple<
std::tuple<
// clang-format off // clang-format off
//#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
//#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
...@@ -44,7 +43,8 @@ using device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances = ...@@ -44,7 +43,8 @@ using device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances =
void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances( void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(
std::vector<DeviceGroupedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances) std::vector<DeviceGroupedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
{ {
add_device_operation_instances(instances, device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances{}); add_device_operation_instances(instances,
device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances{});
} }
} // namespace device_grouped_gemm_instance } // namespace device_grouped_gemm_instance
......
...@@ -24,8 +24,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpeciali ...@@ -24,8 +24,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpeciali
static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization_t::MNPadding; static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization_t::MNPadding;
// Compilation parameters for a[m, k] * b[n, k] = c[m, n] // Compilation parameters for a[m, k] * b[n, k] = c[m, n]
using device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances = using device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances = std::tuple<
std::tuple<
// clang-format off // clang-format off
//##################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| //##################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
//##################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
...@@ -48,8 +47,7 @@ using device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances = ...@@ -48,8 +47,7 @@ using device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances =
>; >;
// irregular tile size // irregular tile size
using device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances = using device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances = std::tuple<
std::tuple<
// clang-format off // clang-format off
//##################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| //##################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
//##################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
...@@ -63,9 +61,10 @@ using device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances = ...@@ -63,9 +61,10 @@ using device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances =
void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances( void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
std::vector<DeviceGroupedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances) std::vector<DeviceGroupedGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
{ {
add_device_operation_instances(instances, device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances{});
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances{}); device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances{});
add_device_operation_instances(
instances, device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances{});
} }
} // namespace device_grouped_gemm_instance } // namespace device_grouped_gemm_instance
......
...@@ -144,12 +144,12 @@ void profile_grouped_gemm_impl(int do_verification, ...@@ -144,12 +144,12 @@ void profile_grouped_gemm_impl(int do_verification,
for(int i = 0; i < group_count; i++) for(int i = 0; i < group_count; i++)
{ {
a_device_buf.push_back( a_device_buf.emplace_back(
std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSize())); std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSize()));
b_device_buf.push_back( b_device_buf.emplace_back(
std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSize())); std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSize()));
c_device_buf.push_back(std::make_unique<DeviceMem>( c_device_buf.emplace_back(std::make_unique<DeviceMem>(
sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSize())); sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSize()));
a_device_buf[i]->ToDevice(a_m_k[i].mData.data()); a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment