Unverified commit 166bb078 authored by Tim Moon, committed by GitHub
Browse files

Reduce CUDA driver calls when choosing transpose kernels (#1002)



Reduce CUDA driver API calls when choosing transpose kernels
Signed-off-by: Tim Moon <tmoon@nvidia.com>
parent a3df1d73
......@@ -49,7 +49,7 @@ struct KernelConfig {
size_t elements_per_store_t = 0;
KernelConfig(size_t row_length, size_t num_rows, size_t itype_size, size_t otype_size,
size_t load_size_, size_t store_size_)
size_t load_size_, size_t store_size_, size_t sm_count)
: load_size{load_size_}, store_size{store_size_} {
// Check that tiles are correctly aligned
constexpr size_t cache_line_size = 128;
......@@ -69,8 +69,7 @@ struct KernelConfig {
// Parameters for performance model
constexpr size_t warps_per_sm = 16; // Rough estimate for saturated SMs
active_sm_count = std::min(DIVUP(num_blocks * warps_per_tile, warps_per_sm),
static_cast<size_t>(cuda::sm_count()));
active_sm_count = std::min(DIVUP(num_blocks * warps_per_tile, warps_per_sm), sm_count);
elements_per_load = (std::min(cache_line_size, row_tile_elements * itype_size) / itype_size);
elements_per_store_c = (std::min(cache_line_size, row_tile_elements * otype_size) / otype_size);
elements_per_store_t = (std::min(cache_line_size, col_tile_elements * otype_size) / otype_size);
......@@ -273,9 +272,10 @@ void cast_transpose(const Tensor &input, const Tensor &noop, Tensor *cast_output
// Pick kernel config
std::vector<KernelConfig> kernel_configs;
kernel_configs.reserve(16);
const size_t sm_count = static_cast<size_t>(cuda::sm_count());
auto add_config = [&](size_t load_size, size_t store_size) {
kernel_configs.emplace_back(row_length, num_rows, itype_size, otype_size, load_size,
store_size);
store_size, sm_count);
};
add_config(8, 8);
add_config(4, 8);
......
......@@ -54,7 +54,8 @@ struct KernelConfig {
size_t elements_per_store_t = 0; // Elements per L1 cache store to transposed output
KernelConfig(size_t row_length, size_t num_rows, size_t itype_size, size_t itype2_size,
size_t otype_size, size_t load_size_, size_t store_size_, bool is_dact_)
size_t otype_size, size_t load_size_, size_t store_size_, size_t sm_count,
bool is_dact_)
: load_size{load_size_}, store_size{store_size_}, is_dact{is_dact_} {
if (is_dact) {
if (load_size > desired_load_size_dact || store_size > desired_store_size_dact) {
......@@ -85,8 +86,7 @@ struct KernelConfig {
// Parameters for performance model
constexpr size_t warps_per_sm = 16; // Rough estimate for saturated SMs
active_sm_count = std::min(DIVUP(num_blocks * n_warps_per_tile, warps_per_sm),
static_cast<size_t>(cuda::sm_count()));
active_sm_count = std::min(DIVUP(num_blocks * n_warps_per_tile, warps_per_sm), sm_count);
elements_per_load = (std::min(cache_line_size, tile_size_x * itype_size) / itype_size);
elements_per_load_dact = (std::min(cache_line_size, tile_size_x * itype2_size) / itype2_size);
elements_per_store_c = (std::min(cache_line_size, tile_size_x * otype_size) / otype_size);
......@@ -535,9 +535,10 @@ void cast_transpose_fused(const Tensor &input, const Tensor &act_input, Tensor *
// Pick kernel config
std::vector<KernelConfig> kernel_configs;
kernel_configs.reserve(16);
const size_t sm_count = static_cast<size_t>(cuda::sm_count());
auto add_config = [&](size_t load_size_config, size_t store_size_config) {
kernel_configs.emplace_back(row_length, num_rows, itype_size, itype2_size, otype_size,
load_size_config, store_size_config, IS_DACT);
load_size_config, store_size_config, sm_count, IS_DACT);
};
add_config(8, 8);
add_config(4, 8);
......
......@@ -46,7 +46,7 @@ struct KernelConfig {
size_t elements_per_store = 0;
KernelConfig(size_t row_length, size_t num_rows, size_t type_size, size_t load_size_,
size_t store_size_)
size_t store_size_, size_t sm_count)
: load_size{load_size_}, store_size{store_size_} {
// Check that tiles are correctly aligned
constexpr size_t cache_line_size = 128;
......@@ -66,8 +66,7 @@ struct KernelConfig {
// Parameters for performance model
constexpr size_t warps_per_sm = 16; // Rough estimate for saturated SMs
active_sm_count = std::min(DIVUP(num_blocks * warps_per_tile, warps_per_sm),
static_cast<size_t>(cuda::sm_count()));
active_sm_count = std::min(DIVUP(num_blocks * warps_per_tile, warps_per_sm), sm_count);
elements_per_load = (std::min(cache_line_size, row_tile_elements * type_size) / type_size);
elements_per_store = (std::min(cache_line_size, col_tile_elements * type_size) / type_size);
}
......@@ -231,8 +230,10 @@ void transpose(const Tensor &input, const Tensor &noop, Tensor *output_, cudaStr
// Pick kernel config
std::vector<KernelConfig> kernel_configs;
kernel_configs.reserve(16);
const size_t sm_count = static_cast<size_t>(cuda::sm_count());
auto add_config = [&](size_t load_size, size_t store_size) {
kernel_configs.emplace_back(row_length, num_rows, type_size, load_size, store_size);
kernel_configs.emplace_back(row_length, num_rows, type_size, load_size, store_size,
sm_count);
};
add_config(8, 8);
add_config(4, 8);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment