"...composable_kernel-1.git" did not exist on "b9b9c3b8147572516e239c3c360a8d9f67d32dee"
Unverified Commit c5f6ec84 authored by Qianfeng's avatar Qianfeng Committed by GitHub
Browse files

Using number of compute units to set gridSize (#754)

* Add getAvailableComputeUnitCount() interface

* Use available number of compute units to set kernel grid size
parent d1838d32
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <hip/hip_runtime.h>
#include "ck/stream_config.hpp"
#include "ck/host_utility/hip_check_error.hpp"
static int getAvailableComputeUnitCount(const StreamConfig& stream_config)
{
constexpr int MAX_MASK_DWORDS = 64;
// assume at most 64*32 = 2048 CUs
uint32_t cuMask[MAX_MASK_DWORDS];
for(int i = 0; i < MAX_MASK_DWORDS; i++)
cuMask[i] = 0;
auto countSetBits = [](uint32_t dword) {
int count = 0;
while(dword != 0)
{
if(dword & 0x1)
count++;
dword = dword >> 1;
};
return (count);
};
hip_check_error(hipExtStreamGetCUMask(stream_config.stream_id_, MAX_MASK_DWORDS, &cuMask[0]));
int ret = 0;
for(int i = 0; i < MAX_MASK_DWORDS; i++)
ret += countSetBits(cuMask[i]);
return (ret);
};
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/host_utility/kernel_launch.hpp" #include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -171,10 +172,7 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple, ...@@ -171,10 +172,7 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
inStridesArray_(inStridesArray), inStridesArray_(inStridesArray),
outStridesArray_(outStridesArray), outStridesArray_(outStridesArray),
elementwise_op_(elementwise_op), elementwise_op_(elementwise_op),
blockSize_(256), blockSize_(256)
gridSize_(120), // FIXME - Calculate the grid size by number of CU in the future
num_threads_m_((gridSize_ * blockSize_) / 16),
num_threads_n_(16)
{ {
static_assert(NumDim_m > 0, ""); static_assert(NumDim_m > 0, "");
static_assert(NumDim_n > 0, ""); static_assert(NumDim_n > 0, "");
...@@ -192,34 +190,10 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple, ...@@ -192,34 +190,10 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
return static_cast<DataType*>(out_dev_buffers[I.value]); return static_cast<DataType*>(out_dev_buffers[I.value]);
}, },
Number<NumOutput>{}); Number<NumOutput>{});
in_grid_2d_desc_tuple_ = generate_tuple(
[&](auto I) {
return MakeDescriptor_MN(lengths,
inStridesArray[I.value],
gridSize_,
blockSize_,
num_threads_m_,
num_threads_n_);
},
Number<NumInput>{});
out_grid_2d_desc_tuple_ = generate_tuple(
[&](auto I) {
return MakeDescriptor_MN(lengths,
outStridesArray[I.value],
gridSize_,
blockSize_,
num_threads_m_,
num_threads_n_);
},
Number<NumOutput>{});
} }
InDataTypePointerTuple in_dev_buffers_; InDataTypePointerTuple in_dev_buffers_;
OutDataTypePointerTuple out_dev_buffers_; OutDataTypePointerTuple out_dev_buffers_;
InGrid2dDescTuple in_grid_2d_desc_tuple_;
OutGrid2dDescTuple out_grid_2d_desc_tuple_;
std::array<index_t, NumDim> lengths_; std::array<index_t, NumDim> lengths_;
std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_; std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
...@@ -227,15 +201,38 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple, ...@@ -227,15 +201,38 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
ElementwiseOperation elementwise_op_; ElementwiseOperation elementwise_op_;
index_t blockSize_; index_t blockSize_;
index_t gridSize_;
index_t num_threads_m_;
index_t num_threads_n_;
}; };
struct Invoker : public BaseInvoker struct Invoker : public BaseInvoker
{ {
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{ {
index_t gridSize = getAvailableComputeUnitCount(stream_config);
index_t num_threads_m = (gridSize * arg.blockSize_) / 16;
index_t num_threads_n = 16;
auto in_grid_2d_desc_tuple = generate_tuple(
[&](auto I) {
return MakeDescriptor_MN(arg.lengths_,
arg.inStridesArray_[I.value],
gridSize,
arg.blockSize_,
num_threads_m,
num_threads_n);
},
Number<NumInput>{});
auto out_grid_2d_desc_tuple = generate_tuple(
[&](auto I) {
return MakeDescriptor_MN(arg.lengths_,
arg.outStridesArray_[I.value],
gridSize,
arg.blockSize_,
num_threads_m,
num_threads_n);
},
Number<NumOutput>{});
const auto kernel = kernel_elementwise_2d<GridwiseElementwise, const auto kernel = kernel_elementwise_2d<GridwiseElementwise,
InGrid2dDescTuple, InGrid2dDescTuple,
OutGrid2dDescTuple, OutGrid2dDescTuple,
...@@ -245,16 +242,16 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple, ...@@ -245,16 +242,16 @@ struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
float elapsed_time = launch_and_time_kernel(stream_config, float elapsed_time = launch_and_time_kernel(stream_config,
kernel, kernel,
dim3(arg.gridSize_), dim3(gridSize),
dim3(arg.blockSize_), dim3(arg.blockSize_),
0, 0,
arg.in_grid_2d_desc_tuple_, in_grid_2d_desc_tuple,
arg.out_grid_2d_desc_tuple_, out_grid_2d_desc_tuple,
arg.in_dev_buffers_, arg.in_dev_buffers_,
arg.out_dev_buffers_, arg.out_dev_buffers_,
arg.elementwise_op_, arg.elementwise_op_,
arg.num_threads_m_, num_threads_m,
arg.num_threads_n_); num_threads_n);
return elapsed_time; return elapsed_time;
} }
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/host_utility/kernel_launch.hpp" #include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -144,8 +145,7 @@ struct DeviceElementwiseImpl ...@@ -144,8 +145,7 @@ struct DeviceElementwiseImpl
inStridesArray_(inStridesArray), inStridesArray_(inStridesArray),
outStridesArray_(outStridesArray), outStridesArray_(outStridesArray),
elementwise_op_(elementwise_op), elementwise_op_(elementwise_op),
blockSize_(256), blockSize_(256)
gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
{ {
in_dev_buffers_ = generate_tuple( in_dev_buffers_ = generate_tuple(
[&](auto I) { [&](auto I) {
...@@ -160,26 +160,10 @@ struct DeviceElementwiseImpl ...@@ -160,26 +160,10 @@ struct DeviceElementwiseImpl
return static_cast<DataType*>(out_dev_buffers[I.value]); return static_cast<DataType*>(out_dev_buffers[I.value]);
}, },
Number<NumOutput>{}); Number<NumOutput>{});
in_grid_1d_desc_tuple_ = generate_tuple(
[&](auto I) {
return MakeDescriptor_M(
lengths, inStridesArray[I.value], gridSize_, blockSize_);
},
Number<NumInput>{});
out_grid_1d_desc_tuple_ = generate_tuple(
[&](auto I) {
return MakeDescriptor_M(
lengths, outStridesArray[I.value], gridSize_, blockSize_);
},
Number<NumOutput>{});
} }
InDataTypePointerTuple in_dev_buffers_; InDataTypePointerTuple in_dev_buffers_;
OutDataTypePointerTuple out_dev_buffers_; OutDataTypePointerTuple out_dev_buffers_;
InGrid1dDescTuple in_grid_1d_desc_tuple_;
OutGrid1dDescTuple out_grid_1d_desc_tuple_;
std::array<index_t, NumDim> lengths_; std::array<index_t, NumDim> lengths_;
std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_; std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
...@@ -187,13 +171,28 @@ struct DeviceElementwiseImpl ...@@ -187,13 +171,28 @@ struct DeviceElementwiseImpl
ElementwiseOperation elementwise_op_; ElementwiseOperation elementwise_op_;
index_t blockSize_; index_t blockSize_;
index_t gridSize_;
}; };
struct Invoker : public BaseInvoker struct Invoker : public BaseInvoker
{ {
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{ {
index_t gridSize = getAvailableComputeUnitCount(stream_config);
auto in_grid_1d_desc_tuple = generate_tuple(
[&](auto I) {
return MakeDescriptor_M(
arg.lengths_, arg.inStridesArray_[I.value], gridSize, arg.blockSize_);
},
Number<NumInput>{});
auto out_grid_1d_desc_tuple = generate_tuple(
[&](auto I) {
return MakeDescriptor_M(
arg.lengths_, arg.outStridesArray_[I.value], gridSize, arg.blockSize_);
},
Number<NumOutput>{});
const auto kernel = kernel_elementwise_1d<GridwiseElementwise, const auto kernel = kernel_elementwise_1d<GridwiseElementwise,
InGrid1dDescTuple, InGrid1dDescTuple,
OutGrid1dDescTuple, OutGrid1dDescTuple,
...@@ -203,11 +202,11 @@ struct DeviceElementwiseImpl ...@@ -203,11 +202,11 @@ struct DeviceElementwiseImpl
float elapsed_time = launch_and_time_kernel(stream_config, float elapsed_time = launch_and_time_kernel(stream_config,
kernel, kernel,
dim3(arg.gridSize_), dim3(gridSize),
dim3(arg.blockSize_), dim3(arg.blockSize_),
0, 0,
arg.in_grid_1d_desc_tuple_, in_grid_1d_desc_tuple,
arg.out_grid_1d_desc_tuple_, out_grid_1d_desc_tuple,
arg.in_dev_buffers_, arg.in_dev_buffers_,
arg.out_dev_buffers_, arg.out_dev_buffers_,
arg.elementwise_op_); arg.elementwise_op_);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment