Unverified commit 166bb078, authored by Tim Moon, committed by GitHub
Browse files

Reduce CUDA driver calls when choosing transpose kernels (#1002)



Reduce CUDA driver API calls when choosing transpose kernels
Signed-off-by: Tim Moon <tmoon@nvidia.com>
parent a3df1d73
...@@ -49,7 +49,7 @@ struct KernelConfig { ...@@ -49,7 +49,7 @@ struct KernelConfig {
size_t elements_per_store_t = 0; size_t elements_per_store_t = 0;
KernelConfig(size_t row_length, size_t num_rows, size_t itype_size, size_t otype_size, KernelConfig(size_t row_length, size_t num_rows, size_t itype_size, size_t otype_size,
size_t load_size_, size_t store_size_) size_t load_size_, size_t store_size_, size_t sm_count)
: load_size{load_size_}, store_size{store_size_} { : load_size{load_size_}, store_size{store_size_} {
// Check that tiles are correctly aligned // Check that tiles are correctly aligned
constexpr size_t cache_line_size = 128; constexpr size_t cache_line_size = 128;
...@@ -69,8 +69,7 @@ struct KernelConfig { ...@@ -69,8 +69,7 @@ struct KernelConfig {
// Parameters for performance model // Parameters for performance model
constexpr size_t warps_per_sm = 16; // Rough estimate for saturated SMs constexpr size_t warps_per_sm = 16; // Rough estimate for saturated SMs
active_sm_count = std::min(DIVUP(num_blocks * warps_per_tile, warps_per_sm), active_sm_count = std::min(DIVUP(num_blocks * warps_per_tile, warps_per_sm), sm_count);
static_cast<size_t>(cuda::sm_count()));
elements_per_load = (std::min(cache_line_size, row_tile_elements * itype_size) / itype_size); elements_per_load = (std::min(cache_line_size, row_tile_elements * itype_size) / itype_size);
elements_per_store_c = (std::min(cache_line_size, row_tile_elements * otype_size) / otype_size); elements_per_store_c = (std::min(cache_line_size, row_tile_elements * otype_size) / otype_size);
elements_per_store_t = (std::min(cache_line_size, col_tile_elements * otype_size) / otype_size); elements_per_store_t = (std::min(cache_line_size, col_tile_elements * otype_size) / otype_size);
...@@ -273,9 +272,10 @@ void cast_transpose(const Tensor &input, const Tensor &noop, Tensor *cast_output ...@@ -273,9 +272,10 @@ void cast_transpose(const Tensor &input, const Tensor &noop, Tensor *cast_output
// Pick kernel config // Pick kernel config
std::vector<KernelConfig> kernel_configs; std::vector<KernelConfig> kernel_configs;
kernel_configs.reserve(16); kernel_configs.reserve(16);
const size_t sm_count = static_cast<size_t>(cuda::sm_count());
auto add_config = [&](size_t load_size, size_t store_size) { auto add_config = [&](size_t load_size, size_t store_size) {
kernel_configs.emplace_back(row_length, num_rows, itype_size, otype_size, load_size, kernel_configs.emplace_back(row_length, num_rows, itype_size, otype_size, load_size,
store_size); store_size, sm_count);
}; };
add_config(8, 8); add_config(8, 8);
add_config(4, 8); add_config(4, 8);
......
...@@ -54,7 +54,8 @@ struct KernelConfig { ...@@ -54,7 +54,8 @@ struct KernelConfig {
size_t elements_per_store_t = 0; // Elements per L1 cache store to transposed output size_t elements_per_store_t = 0; // Elements per L1 cache store to transposed output
KernelConfig(size_t row_length, size_t num_rows, size_t itype_size, size_t itype2_size, KernelConfig(size_t row_length, size_t num_rows, size_t itype_size, size_t itype2_size,
size_t otype_size, size_t load_size_, size_t store_size_, bool is_dact_) size_t otype_size, size_t load_size_, size_t store_size_, size_t sm_count,
bool is_dact_)
: load_size{load_size_}, store_size{store_size_}, is_dact{is_dact_} { : load_size{load_size_}, store_size{store_size_}, is_dact{is_dact_} {
if (is_dact) { if (is_dact) {
if (load_size > desired_load_size_dact || store_size > desired_store_size_dact) { if (load_size > desired_load_size_dact || store_size > desired_store_size_dact) {
...@@ -85,8 +86,7 @@ struct KernelConfig { ...@@ -85,8 +86,7 @@ struct KernelConfig {
// Parameters for performance model // Parameters for performance model
constexpr size_t warps_per_sm = 16; // Rough estimate for saturated SMs constexpr size_t warps_per_sm = 16; // Rough estimate for saturated SMs
active_sm_count = std::min(DIVUP(num_blocks * n_warps_per_tile, warps_per_sm), active_sm_count = std::min(DIVUP(num_blocks * n_warps_per_tile, warps_per_sm), sm_count);
static_cast<size_t>(cuda::sm_count()));
elements_per_load = (std::min(cache_line_size, tile_size_x * itype_size) / itype_size); elements_per_load = (std::min(cache_line_size, tile_size_x * itype_size) / itype_size);
elements_per_load_dact = (std::min(cache_line_size, tile_size_x * itype2_size) / itype2_size); elements_per_load_dact = (std::min(cache_line_size, tile_size_x * itype2_size) / itype2_size);
elements_per_store_c = (std::min(cache_line_size, tile_size_x * otype_size) / otype_size); elements_per_store_c = (std::min(cache_line_size, tile_size_x * otype_size) / otype_size);
...@@ -535,9 +535,10 @@ void cast_transpose_fused(const Tensor &input, const Tensor &act_input, Tensor * ...@@ -535,9 +535,10 @@ void cast_transpose_fused(const Tensor &input, const Tensor &act_input, Tensor *
// Pick kernel config // Pick kernel config
std::vector<KernelConfig> kernel_configs; std::vector<KernelConfig> kernel_configs;
kernel_configs.reserve(16); kernel_configs.reserve(16);
const size_t sm_count = static_cast<size_t>(cuda::sm_count());
auto add_config = [&](size_t load_size_config, size_t store_size_config) { auto add_config = [&](size_t load_size_config, size_t store_size_config) {
kernel_configs.emplace_back(row_length, num_rows, itype_size, itype2_size, otype_size, kernel_configs.emplace_back(row_length, num_rows, itype_size, itype2_size, otype_size,
load_size_config, store_size_config, IS_DACT); load_size_config, store_size_config, sm_count, IS_DACT);
}; };
add_config(8, 8); add_config(8, 8);
add_config(4, 8); add_config(4, 8);
......
...@@ -46,7 +46,7 @@ struct KernelConfig { ...@@ -46,7 +46,7 @@ struct KernelConfig {
size_t elements_per_store = 0; size_t elements_per_store = 0;
KernelConfig(size_t row_length, size_t num_rows, size_t type_size, size_t load_size_, KernelConfig(size_t row_length, size_t num_rows, size_t type_size, size_t load_size_,
size_t store_size_) size_t store_size_, size_t sm_count)
: load_size{load_size_}, store_size{store_size_} { : load_size{load_size_}, store_size{store_size_} {
// Check that tiles are correctly aligned // Check that tiles are correctly aligned
constexpr size_t cache_line_size = 128; constexpr size_t cache_line_size = 128;
...@@ -66,8 +66,7 @@ struct KernelConfig { ...@@ -66,8 +66,7 @@ struct KernelConfig {
// Parameters for performance model // Parameters for performance model
constexpr size_t warps_per_sm = 16; // Rough estimate for saturated SMs constexpr size_t warps_per_sm = 16; // Rough estimate for saturated SMs
active_sm_count = std::min(DIVUP(num_blocks * warps_per_tile, warps_per_sm), active_sm_count = std::min(DIVUP(num_blocks * warps_per_tile, warps_per_sm), sm_count);
static_cast<size_t>(cuda::sm_count()));
elements_per_load = (std::min(cache_line_size, row_tile_elements * type_size) / type_size); elements_per_load = (std::min(cache_line_size, row_tile_elements * type_size) / type_size);
elements_per_store = (std::min(cache_line_size, col_tile_elements * type_size) / type_size); elements_per_store = (std::min(cache_line_size, col_tile_elements * type_size) / type_size);
} }
...@@ -231,8 +230,10 @@ void transpose(const Tensor &input, const Tensor &noop, Tensor *output_, cudaStr ...@@ -231,8 +230,10 @@ void transpose(const Tensor &input, const Tensor &noop, Tensor *output_, cudaStr
// Pick kernel config // Pick kernel config
std::vector<KernelConfig> kernel_configs; std::vector<KernelConfig> kernel_configs;
kernel_configs.reserve(16); kernel_configs.reserve(16);
const size_t sm_count = static_cast<size_t>(cuda::sm_count());
auto add_config = [&](size_t load_size, size_t store_size) { auto add_config = [&](size_t load_size, size_t store_size) {
kernel_configs.emplace_back(row_length, num_rows, type_size, load_size, store_size); kernel_configs.emplace_back(row_length, num_rows, type_size, load_size, store_size,
sm_count);
}; };
add_config(8, 8); add_config(8, 8);
add_config(4, 8); add_config(4, 8);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment