Commit e57bd240 authored by aska-0096's avatar aska-0096
Browse files

Merge branch 'develop' of...

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into add_fp16_wmma_conv_instance
parents 4b70d68e 37a8c1f7
...@@ -71,6 +71,6 @@ TYPED_TEST_SUITE(TestBatchedGemmMultiD, KernelTypes); ...@@ -71,6 +71,6 @@ TYPED_TEST_SUITE(TestBatchedGemmMultiD, KernelTypes);
#ifdef __fp16 #ifdef __fp16
TYPED_TEST(TestBatchedGemmMultiD, f16) { this->template Run<F16>(); } TYPED_TEST(TestBatchedGemmMultiD, f16) { this->template Run<F16>(); }
#endif #endif
#ifdef __int8__ #ifdef CK_ENABLE_INT8
TYPED_TEST(TestBatchedGemmMultiD, int8) { this->template Run<int8_t>(); } TYPED_TEST(TestBatchedGemmMultiD, int8) { this->template Run<int8_t>(); }
#endif #endif
...@@ -2,9 +2,11 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) ...@@ -2,9 +2,11 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0) set(target 0)
foreach(gpu IN LISTS GPU_TARGETS) foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0) if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility) add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance) target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility)
set(target 1) target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
set(target 1)
endif()
endif() endif()
endforeach() endforeach()
...@@ -2,10 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) ...@@ -2,10 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0) set(target 0)
foreach(gpu IN LISTS GPU_TARGETS) foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0) if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_custom_target(test_batched_gemm_softmax_gemm) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp) add_custom_target(test_batched_gemm_softmax_gemm)
target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance) add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16) target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
set(target 1) add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
set(target 1)
endif()
endif() endif()
endforeach() endforeach()
\ No newline at end of file
...@@ -2,21 +2,25 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) ...@@ -2,21 +2,25 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0) set(target 0)
foreach(gpu IN LISTS GPU_TARGETS) foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0) if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_custom_target(test_batched_gemm_softmax_gemm_permute) if(DTYPES MATCHES "fp16" OR DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_custom_target(test_batched_gemm_softmax_gemm_permute)
add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp) endif()
add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16) target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16) target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp) add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp) endif()
target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16) add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16) target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
endif()
set(target 1) set(target 1)
endif() endif()
endforeach() endforeach()
\ No newline at end of file
#include <stdio.h>
#include <string>
#include <algorithm>
#include <vector>
#include <limits>
#include "simple_args.h"
simple_args_t create_arg(int argc, char** argv)
{
simple_args_t args;
args.insert("m", "1024", "matrix m")
.insert("n", "1024", "matrix n")
.insert("k", "1024", "matrix k")
.insert("m_per_block", "128", "m_per_block")
.insert("n_per_block", "128", "n_per_block")
.insert("k_per_block", "32", "k_per_block")
.insert("num_cu", "104", "num cu")
.insert("occupancy", "2", "occupancy")
.parse(argc, argv);
return args;
}
namespace impl {
template <typename T>
T integer_divide_ceil(T n, T d)
{
return (n + d - 1) / d;
}
template <typename T>
T min(T a, T b)
{
return a > b ? b : a;
}
template <typename T>
T max(T a, T b)
{
return a > b ? a : b;
}
} // namespace impl
struct block_dispatcher_t
{
public:
uint32_t m_per_block;
uint32_t n_per_block;
uint32_t k_per_block;
uint32_t num_cu;
uint32_t occupancy;
uint32_t m;
uint32_t n;
uint32_t k;
//--------------------------------------
uint32_t sk_num_blocks;
uint32_t sk_num_big_blocks;
uint32_t sk_total_iters;
// uint32_t sk_num_blocks_per_tile; // how many
uint32_t dp_start_block_idx;
uint32_t dp_iters_per_block;
uint32_t dp_num_blocks;
uint32_t k_iters_per_tile;
uint32_t k_iters_per_big_block;
//--------------------------------------
static constexpr uint32_t min_k_iters_per_sk_block = 1;
void dump()
{
printf("%dx%dx%d(%dx%dx%d), cu:%d, occ:%d, grids:%d, sk_num_big_blocks:%d, "
"sk_num_blocks:%d, sk_total_iters:%d, dp_start_block_idx:%d, dp_iters_per_block:%d, "
"dp_num_blocks:%d, k_iters_per_tile:%d, k_iters_per_big_block:%d\n",
m,
n,
k,
m_per_block,
n_per_block,
k_per_block,
num_cu,
occupancy,
get_grid_dims_x(),
sk_num_big_blocks,
sk_num_blocks,
sk_total_iters,
dp_start_block_idx,
dp_iters_per_block,
dp_num_blocks,
k_iters_per_tile,
k_iters_per_big_block);
}
block_dispatcher_t(uint32_t m_per_block_,
uint32_t n_per_block_,
uint32_t k_per_block_,
uint32_t num_cu_,
uint32_t occupancy_,
uint32_t m_,
uint32_t n_,
uint32_t k_)
: m_per_block(m_per_block_),
n_per_block(n_per_block_),
k_per_block(k_per_block_),
num_cu(num_cu_),
occupancy(occupancy_),
m(m_),
n(n_),
k(k_)
{
init();
}
uint32_t get_grid_dims_x() { return dp_start_block_idx + dp_num_blocks; }
uint32_t get_block_idx(uint32_t bid)
{
// block id is linearily allocated along sk blocks (dp blocks are fine)
// this function will compute blockIdx.x and the linear sk block mapping
// uint32_t block_idx = 0;
// if(bid < sk_num_big_blocks) {
// uint32_t current_k_iter = bid * k_iters_per_big_block;
// tile_idx = current_k_iter / k_iters_per_tile;
// }
return bid;
}
uint32_t get_current_itr(uint32_t block_idx)
{
uint32_t current_itr = 0;
if(block_idx < sk_num_big_blocks)
{
current_itr = block_idx * k_iters_per_big_block;
}
else if(block_idx < sk_num_blocks)
{
current_itr = (sk_num_big_blocks * k_iters_per_big_block) +
(block_idx - sk_num_big_blocks) * (k_iters_per_big_block - 1);
}
else if(block_idx >= dp_start_block_idx)
{
current_itr = sk_total_iters + (block_idx - dp_start_block_idx) * dp_iters_per_block;
}
return current_itr;
}
void get_block_itr(uint32_t block_idx, uint32_t& iter_start, uint32_t& iter_end)
{
if(block_idx < sk_num_big_blocks)
{
iter_start = block_idx * k_iters_per_big_block;
iter_end = iter_start + k_iters_per_big_block;
}
else if(block_idx < sk_num_blocks)
{
iter_start = (sk_num_big_blocks * k_iters_per_big_block) +
(block_idx - sk_num_big_blocks) * (k_iters_per_big_block - 1);
iter_end = iter_start + (k_iters_per_big_block - 1);
}
else if(block_idx >= dp_start_block_idx)
{
iter_start = sk_total_iters + (block_idx - dp_start_block_idx) * dp_iters_per_block;
iter_end = iter_start + dp_iters_per_block;
}
}
private:
void init()
{
uint32_t num_tiles =
impl::integer_divide_ceil(m, m_per_block) * impl::integer_divide_ceil(n, n_per_block);
k_iters_per_tile = impl::integer_divide_ceil(k, k_per_block);
// one cu can hold one wg at one time, from the whole chip's point of view
// if number of wg is same as num_cu, we call it 1 dispatch
// if number of wg is 2x num_cu, we call it 2 dispatches.
// one dispatch can deliever wg same as num_cu (full dispatch), or less than num_cu (partial
// dispatch)
//
uint32_t full_dispatches = num_tiles / num_cu;
uint32_t full_dispatch_tiles = full_dispatches * num_cu;
uint32_t partial_dispatche_tiles = num_tiles - full_dispatch_tiles;
uint32_t sk_occupancy = occupancy;
uint32_t dp_tiles = full_dispatch_tiles;
uint32_t sk_tiles = partial_dispatche_tiles;
if(full_dispatches < occupancy)
{
// in this case, we allocate all blocks as sk blocks
// sk_occupancy = occupancy - full_dispatches;
sk_occupancy = 1; // TODO: single occ seems better
dp_tiles = full_dispatch_tiles;
sk_tiles = partial_dispatche_tiles;
}
else if((occupancy > 1) && (full_dispatches % occupancy == occupancy - 1))
{
// e.g. occupancy = 2, full_dispatches = 3, 5, 7 ...
// occupancy = 3, full_dispatches = 5, 8, 11 ...
// occupancy = 4, full_dispatches = 7, 11 ...
sk_occupancy = 1; // left 1 slot for sk occupancy
dp_tiles = full_dispatch_tiles;
sk_tiles = partial_dispatche_tiles;
}
else
{
// others, we reduce 1 dispatch from dp, together with partial dispatch,
// to construct sk dispatch
sk_occupancy = occupancy - ((full_dispatches - 1) % occupancy);
dp_tiles = full_dispatch_tiles - num_cu;
sk_tiles = partial_dispatche_tiles + num_cu;
}
// dp_num_blocks = dp_tiles;
// dp_start_block_idx = num_cu * sk_occupancy;
dp_iters_per_block = k_iters_per_tile;
sk_total_iters = k_iters_per_tile * sk_tiles;
// printf("num_tiles:%d, full_dispatches:%d, full_dispatch_tiles:%d,
// partial_dispatche_tiles:%d\n",
// num_tiles, full_dispatches, full_dispatch_tiles, partial_dispatche_tiles);
{
uint32_t min_sk_tiles = (sk_tiles >= num_cu) ? num_cu : (sk_tiles + 1);
uint32_t max_sk_tiles =
(sk_tiles >= num_cu) ? num_cu * sk_occupancy
: impl::min(num_cu, sk_total_iters / min_k_iters_per_sk_block);
// if use dp for sk-block, how many iters do we need
uint32_t dp_for_sk_iters = k_iters_per_tile;
uint32_t best_sk_score =
std::numeric_limits<int>::max(); // we need to find the smallest sk iters
for(uint32_t tentative_sk_blocks = min_sk_tiles; tentative_sk_blocks < max_sk_tiles;
tentative_sk_blocks++)
{
uint32_t tentative_sk_iters_per_block =
(sk_total_iters + tentative_sk_blocks - 1) / tentative_sk_blocks;
uint32_t tentative_sk_iters = tentative_sk_iters_per_block;
uint32_t sk_blocks_per_tile = (tentative_sk_blocks + sk_tiles - 1) / sk_tiles;
// TODO: carefully adjust this parameter
// the more sk_blocks_per_tile, the worse the overhead
uint32_t cross_sk_blocks_overhead = sk_blocks_per_tile;
if(tentative_sk_blocks % sk_tiles != 0)
{
// penalty for uneven divide
cross_sk_blocks_overhead +=
sk_blocks_per_tile * tentative_sk_iters_per_block / 50;
}
uint32_t tentative_sk_score = tentative_sk_iters + cross_sk_blocks_overhead;
if(tentative_sk_score < best_sk_score)
{
best_sk_score = tentative_sk_score;
sk_num_blocks = tentative_sk_blocks;
}
}
if(best_sk_score >= dp_for_sk_iters)
{
sk_num_blocks = 0;
}
if(sk_num_blocks == 0)
{
sk_num_big_blocks = 0;
k_iters_per_big_block = 0;
dp_num_blocks = num_tiles; // all tile to be dp block
dp_start_block_idx = 0;
sk_total_iters = 0; // clear this tiles
}
else
{
uint32_t k_iters_per_sk_block = sk_total_iters / sk_num_blocks;
sk_num_big_blocks = sk_total_iters - k_iters_per_sk_block * sk_num_blocks;
k_iters_per_big_block = k_iters_per_sk_block + 1;
dp_num_blocks = dp_tiles;
dp_start_block_idx = (sk_num_blocks + num_cu - 1) / num_cu * num_cu;
}
}
}
};
struct tile_work_t
{
uint32_t tile_idx;
uint32_t iter_begin;
uint32_t k_begin;
uint32_t k_end;
uint32_t k_iters_remaining;
};
int main(int argc, char** argv)
{
simple_args_t arg = create_arg(argc, argv);
block_dispatcher_t block_dispatcher{arg.get_uint32("m_per_block"),
arg.get_uint32("n_per_block"),
arg.get_uint32("k_per_block"),
arg.get_uint32("num_cu"),
arg.get_uint32("occupancy"),
arg.get_uint32("m"),
arg.get_uint32("n"),
arg.get_uint32("k")};
block_dispatcher.dump();
// simulate actual kernel launch
uint32_t dim_x = block_dispatcher.get_grid_dims_x();
uint32_t total_k_iters =
impl::integer_divide_ceil(arg.get_uint32("k"), arg.get_uint32("k_per_block"));
uint32_t num_tiles =
impl::integer_divide_ceil(arg.get_uint32("m"), arg.get_uint32("m_per_block")) *
impl::integer_divide_ceil(arg.get_uint32("n"), arg.get_uint32("n_per_block"));
std::vector<int> valid_tile_record(num_tiles * total_k_iters);
for(uint32_t bid = 0; bid < dim_x; bid++)
{
uint32_t block_idx = block_dispatcher.get_block_idx(bid);
bool is_sk_block = block_idx < (block_dispatcher.sk_num_blocks);
bool is_dp_block = block_idx >= block_dispatcher.dp_start_block_idx;
uint32_t iter_start, iter_end;
block_dispatcher.get_block_itr(block_idx, iter_start, iter_end);
uint32_t total_iter_length = iter_end - iter_start;
while(true)
{
uint32_t iter_length_mod = iter_end % block_dispatcher.k_iters_per_tile;
uint32_t current_iter_length =
impl::min(iter_length_mod == 0 ? (iter_end - iter_start) : iter_length_mod,
total_iter_length);
uint32_t tile_idx = (iter_end - 1) / block_dispatcher.k_iters_per_tile;
uint32_t tile_iter_start =
((iter_end - 1) % block_dispatcher.k_iters_per_tile) - current_iter_length + 1;
if(is_sk_block)
{
printf("[sk_block] bid:%3d, block_idx:%3d, tile_idx:%3d, iter_start:%d(%d | %d), "
"iter_end:%d (len:%d)\n",
bid,
block_idx,
tile_idx,
iter_end - current_iter_length,
tile_iter_start,
iter_start,
iter_end,
current_iter_length);
}
else if(is_dp_block)
{
printf("[dp_block] bid:%3d, block_idx:%3d, tile_idx:%3d, iter_start:%d(%d | %d), "
"iter_end:%d (len:%d)\n",
bid,
block_idx,
tile_idx,
iter_end - current_iter_length,
tile_iter_start,
iter_start,
iter_end,
current_iter_length);
}
else
{
printf("[other ] bid:%3d, block_idx:%3d\n", bid, block_idx);
}
// some validation check
for(auto i = iter_end - current_iter_length; i < iter_end; i++)
{
if(i >= valid_tile_record.size())
{
printf("unexpected, current iter:%d larger than max:%d\n",
i,
valid_tile_record.size());
return -1;
}
valid_tile_record[i] = 1;
}
iter_end -= current_iter_length;
if(iter_end <= iter_start)
break;
}
}
int untouched = 0;
for(auto i = 0; i < valid_tile_record.size(); i++)
{
if(valid_tile_record[i] != 1)
{
printf("untouched at %d (%d)\n", i, valid_tile_record.size());
untouched++;
}
}
printf("untouched %d/%d, %s\n",
untouched,
valid_tile_record.size(),
untouched == 0 ? "valid" : "fail");
}
CC=g++
$CC -Wall -std=c++17 -Iinclude -O3 block_swizzle_test.cpp -o block_swizzle_test.exe
\ No newline at end of file
#pragma once
#include <iomanip>
#include <iostream>
#include <stdlib.h>
#include <string>
#include <unordered_map>
#include <vector>
#include <assert.h>
struct arg_content_t
{
std::string name; // key
std::string value;
std::string help_text;
};
class simple_args_t
{
public:
simple_args_t() {}
simple_args_t& insert(const std::string& name_,
const std::string& default_value_,
const std::string& help_text_)
{
arg_content_t arg{name_, default_value_, help_text_};
if(arg_map.count(arg.name) != 0)
{
std::cout << "arg:" << arg.name << "already exist" << std::endl;
}
else
{
arg_map[arg.name] = arg;
}
return *this;
}
void usage()
{
for(auto& content : arg_map)
{
std::vector<std::string> help_text_lines;
size_t pos = 0;
for(size_t next_pos = content.second.help_text.find('\n', pos);
next_pos != std::string::npos;)
{
help_text_lines.push_back(
std::string(content.second.help_text.begin() + pos,
content.second.help_text.begin() + next_pos++));
pos = next_pos;
next_pos = content.second.help_text.find('\n', pos);
}
help_text_lines.push_back(std::string(content.second.help_text.begin() + pos,
content.second.help_text.end()));
int arg_name_width = 16 - content.second.name.length();
arg_name_width = arg_name_width > 0 ? arg_name_width : 2;
std::cout << std::setw(4) << "-" << content.second.name << std::setw(arg_name_width)
<< " " << help_text_lines[0] << std::endl;
for(auto help_next_line = std::next(help_text_lines.begin());
help_next_line != help_text_lines.end();
++help_next_line)
{
std::cout << std::setw(28) << " " << *help_next_line << std::endl;
}
}
}
bool parse(int argc, char* argv[], int start_index = 1)
{
if(argc <= start_index)
{
// std::cout << "not enough args (" << argc << ") with starting index " << start_index
// << std::endl;
return true;
}
for(int i = start_index; i < argc; i++)
{
std::string cur_arg = std::string(argv[i]);
if(cur_arg[0] != '-')
{
std::cout << "illegal input" << std::endl;
usage();
return false;
}
else if(cur_arg[0] == '-' && cur_arg[1] == '?')
{
usage();
return false;
}
else
{
size_t found_equal = cur_arg.find('=');
if(found_equal == std::string::npos || found_equal == (cur_arg.length() - 1))
{
std::cout << "failed while parsing \"" << cur_arg << "\", "
<< "arg must be in the form \"-name=value\"" << std::endl;
return false;
}
std::string arg_name = cur_arg.substr(1, found_equal - 1);
std::string arg_value = cur_arg.substr(found_equal + 1);
if(arg_map.count(arg_name) == 0)
{
std::cout << "no such arg \"" << arg_name << "\" registered" << std::endl;
return false;
}
arg_map[arg_name].value = arg_value;
}
}
return true;
}
std::string get(const std::string& name) const { return get_str(name); }
std::string get_str(const std::string& name) const
{
assert(arg_map.count(name) != 0);
std::string value = arg_map.at(name).value;
return value;
}
int get_int(const std::string& name) const
{
assert(arg_map.count(name) != 0);
int value = atoi(arg_map.at(name).value.c_str());
return value;
}
uint32_t get_uint32(const std::string& name) const
{
assert(arg_map.count(name) != 0);
uint32_t value = strtoul(arg_map.at(name).value.c_str(), nullptr, 10);
return value;
}
uint64_t get_uint64(const std::string& name) const
{
assert(arg_map.count(name) != 0);
uint64_t value = strtoull(arg_map.at(name).value.c_str(), nullptr, 10);
return value;
}
double get_double(const std::string& name) const
{
assert(arg_map.count(name) != 0);
double value = atof(arg_map.at(name).value.c_str());
return value;
}
float get_float(const std::string& name) const
{
assert(arg_map.count(name) != 0);
float value = atof(arg_map.at(name).value.c_str());
return value;
}
private:
std::unordered_map<std::string, arg_content_t> arg_map;
};
...@@ -38,7 +38,7 @@ class ContractionInstanceWrapper ...@@ -38,7 +38,7 @@ class ContractionInstanceWrapper
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle< NumDim, NumDim, NumDim, F32, F32, F32, F32, ck::Tuple<F32>, F32, Pass, Pass, Bilinear, GemmSpec, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, CDEBlockTransferScalarPerVector>; DeviceContractionMultipleD_Xdl_CShuffle< NumDim, NumDim, NumDim, F32, F32, F32, F32, ck::Tuple<F32>, F32, Pass, Pass, Bilinear, GemmSpec, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, CDEBlockTransferScalarPerVector>;
// clang-format on // clang-format on
bool isSupported(std::vector<ck::index_t>& ADims, bool isSupported(std::vector<ck::index_t>& ADims,
......
add_custom_target(test_elementwise_normalization) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_custom_target(test_elementwise_normalization)
add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp) add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp)
target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance)
target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance) add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16)
endif()
add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16) \ No newline at end of file
...@@ -2,10 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) ...@@ -2,10 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0) set(target 0)
foreach(gpu IN LISTS GPU_TARGETS) foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0) if(gpu IN_LIST gpu_list AND target EQUAL 0)
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_custom_target(test_gemm_layernorm) add_custom_target(test_gemm_layernorm)
add_gtest_executable(test_gemm_add_relu_add_layernorm_fp16 test_gemm_add_relu_add_layernorm_fp16.cpp) add_gtest_executable(test_gemm_add_relu_add_layernorm_fp16 test_gemm_add_relu_add_layernorm_fp16.cpp)
target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance) target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance)
add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16) add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16)
set(target 1) set(target 1)
endif()
endif() endif()
endforeach() endforeach()
add_test_executable(test_gemm_reduce_fp16 gemm_reduce_fp16.cpp) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries(test_gemm_reduce_fp16 PRIVATE utility) add_test_executable(test_gemm_reduce_fp16 gemm_reduce_fp16.cpp)
target_link_libraries(test_gemm_reduce_fp16 PRIVATE device_gemm_reduce_instance) target_link_libraries(test_gemm_reduce_fp16 PRIVATE utility)
target_link_libraries(test_gemm_reduce_fp16 PRIVATE device_gemm_reduce_instance)
endif()
\ No newline at end of file
if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940") if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data.cpp) add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data.cpp)
target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance) target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface.cpp) add_gtest_executable(test_grouped_convnd_bwd_data_interface test_grouped_convnd_bwd_data_interface.cpp)
target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance) target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance)
endif() endif()
\ No newline at end of file
...@@ -46,23 +46,36 @@ class TestGroupedConvndBwdData : public ::testing::Test ...@@ -46,23 +46,36 @@ class TestGroupedConvndBwdData : public ::testing::Test
} }
}; };
using GNHWC = ck::tensor_layout::convolution::GNHWC; using namespace ck::tensor_layout::convolution;
using NHWGC = ck::tensor_layout::convolution::NHWGC;
using GKYXC = ck::tensor_layout::convolution::GKYXC; using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWK, GKYXC, GNHWC>,
std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>,
std::tuple<ck::bhalf_t, GNHWK, GKYXC, GNHWC>,
std::tuple<float, NHWGK, GKYXC, NHWGC>,
std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>,
std::tuple<ck::bhalf_t, NHWGK, GKYXC, NHWGC>>;
using GNHWK = ck::tensor_layout::convolution::GNHWK; using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWK, GKZYXC, GNDHWC>,
using NHWGK = ck::tensor_layout::convolution::NHWGK; std::tuple<ck::half_t, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<ck::bhalf_t, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<float, NDHWGK, GKZYXC, NDHWGC>,
std::tuple<ck::half_t, NDHWGK, GKZYXC, NDHWGC>,
std::tuple<ck::bhalf_t, NDHWGK, GKZYXC, NDHWGC>>;
using KernelTypes = ::testing::Types<std::tuple<float, GNHWK, GKYXC, GNHWC>, template <typename Tuple>
std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>, class TestGroupedConvndBwdData2d : public TestGroupedConvndBwdData<Tuple>
std::tuple<ck::bhalf_t, GNHWK, GKYXC, GNHWC>, {
std::tuple<float, NHWGK, GKYXC, NHWGC>, };
std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>,
std::tuple<ck::bhalf_t, NHWGK, GKYXC, NHWGC>>;
TYPED_TEST_SUITE(TestGroupedConvndBwdData, KernelTypes);
TYPED_TEST(TestGroupedConvndBwdData, Test2D) template <typename Tuple>
class TestGroupedConvndBwdData3d : public TestGroupedConvndBwdData<Tuple>
{
};
TYPED_TEST_SUITE(TestGroupedConvndBwdData2d, KernelTypes2d);
TYPED_TEST_SUITE(TestGroupedConvndBwdData3d, KernelTypes3d);
TYPED_TEST(TestGroupedConvndBwdData2d, Test2D)
{ {
this->conv_params.clear(); this->conv_params.clear();
...@@ -74,5 +87,26 @@ TYPED_TEST(TestGroupedConvndBwdData, Test2D) ...@@ -74,5 +87,26 @@ TYPED_TEST(TestGroupedConvndBwdData, Test2D)
{2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); {2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
this->conv_params.push_back( this->conv_params.push_back(
{2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); {2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->template Run<2>(); this->template Run<2>();
} }
TYPED_TEST(TestGroupedConvndBwdData3d, Test3D)
{
this->conv_params.clear();
this->conv_params.push_back(
{3, 2, 16, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 2, 2, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->template Run<3>();
}
...@@ -85,7 +85,10 @@ using KernelTypes2d = ::testing::Types< ...@@ -85,7 +85,10 @@ using KernelTypes2d = ::testing::Types<
using KernelTypes3d = ::testing::Types< using KernelTypes3d = ::testing::Types<
std::tuple<float, float, float, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>, std::tuple<float, float, float, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>, std::tuple<ck::half_t, ck::half_t, ck::half_t, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>,
std::tuple<ck::bhalf_t, float, ck::bhalf_t, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>>; std::tuple<ck::bhalf_t, float, ck::bhalf_t, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>,
std::tuple<float, float, float, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>,
std::tuple<ck::half_t, ck::half_t, ck::half_t, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>,
std::tuple<ck::bhalf_t, float, ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>>;
TYPED_TEST_SUITE(TestGroupedConvndBwdWeight1d, KernelTypes1d); TYPED_TEST_SUITE(TestGroupedConvndBwdWeight1d, KernelTypes1d);
TYPED_TEST_SUITE(TestGroupedConvndBwdWeight2d, KernelTypes2d); TYPED_TEST_SUITE(TestGroupedConvndBwdWeight2d, KernelTypes2d);
...@@ -97,6 +100,9 @@ TYPED_TEST(TestGroupedConvndBwdWeight1d, Test1D) ...@@ -97,6 +100,9 @@ TYPED_TEST(TestGroupedConvndBwdWeight1d, Test1D)
this->conv_params.push_back({1, 2, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}}); this->conv_params.push_back({1, 2, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}});
this->conv_params.push_back({1, 2, 32, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}}); this->conv_params.push_back({1, 2, 32, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 2, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}}); this->conv_params.push_back({1, 2, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}});
this->conv_params.push_back({1, 1, 1, 1, 32, {3}, {32}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 1, 1, 64, 3, {3}, {32}, {1}, {1}, {1}, {1}});
this->conv_params.push_back({1, 1, 1, 1, 1, {3}, {32}, {1}, {1}, {1}, {1}});
this->Run(); this->Run();
} }
...@@ -109,6 +115,9 @@ TYPED_TEST(TestGroupedConvndBwdWeight2d, Test2D) ...@@ -109,6 +115,9 @@ TYPED_TEST(TestGroupedConvndBwdWeight2d, Test2D)
{2, 2, 4, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); {2, 2, 4, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back( this->conv_params.push_back(
{2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); {2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
this->conv_params.push_back({2, 1, 1, 1, 32, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back({2, 1, 1, 64, 3, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back({2, 1, 1, 1, 1, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->Run(); this->Run();
} }
...@@ -121,5 +130,11 @@ TYPED_TEST(TestGroupedConvndBwdWeight3d, Test3D) ...@@ -121,5 +130,11 @@ TYPED_TEST(TestGroupedConvndBwdWeight3d, Test3D)
{3, 2, 2, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); {3, 2, 2, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back( this->conv_params.push_back(
{3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); {3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->Run(); this->Run();
} }
...@@ -70,10 +70,11 @@ class TestGroupedConvndBwdWeight : public ::testing::Test ...@@ -70,10 +70,11 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>( ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
conv_param); conv_param);
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{}; std::array<ck::index_t, NDimSpatial + 3> input_lengths{};
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{}; std::array<ck::index_t, NDimSpatial + 3> filter_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{}; std::array<ck::index_t, NDimSpatial + 3> output_lengths{};
std::array<ck::index_t, NDimSpatial + 3> input_strides{}; std::array<ck::index_t, NDimSpatial + 3> input_strides{};
std::array<ck::index_t, NDimSpatial + 3> weights_strides{};
std::array<ck::index_t, NDimSpatial + 3> output_strides{}; std::array<ck::index_t, NDimSpatial + 3> output_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{}; std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{}; std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
...@@ -82,10 +83,11 @@ class TestGroupedConvndBwdWeight : public ::testing::Test ...@@ -82,10 +83,11 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); }; auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };
range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths)); range_copy(in_g_n_c_wis_desc.GetLengths(), begin(input_lengths));
range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths));
range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths));
range_copy(in_g_n_c_wis_desc.GetStrides(), begin(input_strides)); range_copy(in_g_n_c_wis_desc.GetStrides(), begin(input_strides));
range_copy(wei_g_k_c_xs_desc.GetLengths(), begin(filter_lengths));
range_copy(wei_g_k_c_xs_desc.GetStrides(), begin(weights_strides));
range_copy(out_g_n_k_wos_desc.GetLengths(), begin(output_lengths));
range_copy(out_g_n_k_wos_desc.GetStrides(), begin(output_strides)); range_copy(out_g_n_k_wos_desc.GetStrides(), begin(output_strides));
range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides)); range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations)); range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
...@@ -97,14 +99,11 @@ class TestGroupedConvndBwdWeight : public ::testing::Test ...@@ -97,14 +99,11 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
auto argument = conv.MakeArgument(nullptr, auto argument = conv.MakeArgument(nullptr,
nullptr, nullptr,
nullptr, nullptr,
conv_param.G_, input_lengths,
conv_param.N_,
conv_param.K_,
conv_param.C_,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_strides, input_strides,
filter_lengths,
weights_strides,
output_lengths,
output_strides, output_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
......
...@@ -22,6 +22,8 @@ TEST_F(TestGroupedConvNdFwd, GroupedConv1dFwdGNWC) ...@@ -22,6 +22,8 @@ TEST_F(TestGroupedConvNdFwd, GroupedConv1dFwdGNWC)
conv_params.push_back({1, 2, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}}); conv_params.push_back({1, 2, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}});
conv_params.push_back({1, 2, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}}); conv_params.push_back({1, 2, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}});
conv_params.push_back({1, 2, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}}); conv_params.push_back({1, 2, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}});
conv_params.push_back({1, 1, 1, 1, 32, {3}, {32}, {1}, {1}, {1}, {1}});
conv_params.push_back({1, 1, 1, 64, 3, {3}, {32}, {1}, {1}, {1}, {1}});
for(auto& param : conv_params) for(auto& param : conv_params)
{ {
...@@ -96,6 +98,9 @@ TEST_F(TestGroupedConvNdFwd, GroupedConv2dFwdGNHWC) ...@@ -96,6 +98,9 @@ TEST_F(TestGroupedConvNdFwd, GroupedConv2dFwdGNHWC)
conv_params.push_back({2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); conv_params.push_back({2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
conv_params.push_back({2, 2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); conv_params.push_back({2, 2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
conv_params.push_back({2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); conv_params.push_back({2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
conv_params.push_back({2, 1, 1, 1, 32, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
conv_params.push_back({2, 1, 1, 64, 3, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
conv_params.push_back({2, 1, 1, 1, 1, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
for(auto& param : conv_params) for(auto& param : conv_params)
{ {
...@@ -173,6 +178,12 @@ TEST_F(TestGroupedConvNdFwd, GroupedConv3dFwdGNDHWC) ...@@ -173,6 +178,12 @@ TEST_F(TestGroupedConvNdFwd, GroupedConv3dFwdGNDHWC)
{3, 2, 128, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); {3, 2, 128, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
conv_params.push_back( conv_params.push_back(
{3, 2, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); {3, 2, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
conv_params.push_back(
{3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
conv_params.push_back(
{3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
for(auto& param : conv_params) for(auto& param : conv_params)
{ {
...@@ -247,6 +258,9 @@ TEST_F(TestGroupedConvNdFwd, GroupedConv2dFwdNHWGC) ...@@ -247,6 +258,9 @@ TEST_F(TestGroupedConvNdFwd, GroupedConv2dFwdNHWGC)
conv_params.push_back({2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); conv_params.push_back({2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
conv_params.push_back({2, 2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); conv_params.push_back({2, 2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
conv_params.push_back({2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); conv_params.push_back({2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
conv_params.push_back({2, 1, 1, 1, 32, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
conv_params.push_back({2, 1, 1, 64, 3, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
conv_params.push_back({2, 1, 1, 1, 1, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
for(auto& param : conv_params) for(auto& param : conv_params)
{ {
...@@ -255,7 +269,7 @@ TEST_F(TestGroupedConvNdFwd, GroupedConv2dFwdNHWGC) ...@@ -255,7 +269,7 @@ TEST_F(TestGroupedConvNdFwd, GroupedConv2dFwdNHWGC)
// fp16 // fp16
pass = ck::profiler::profile_grouped_conv_fwd_impl<2, pass = ck::profiler::profile_grouped_conv_fwd_impl<2,
ck::tensor_layout::convolution::NHWGC, ck::tensor_layout::convolution::NHWGC,
ck::tensor_layout::convolution::KYXGC, ck::tensor_layout::convolution::GKYXC,
ck::tensor_layout::convolution::NHWGK, ck::tensor_layout::convolution::NHWGK,
ck::half_t, ck::half_t,
ck::half_t, ck::half_t,
......
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0) set(target 0)
foreach(gpu IN LISTS GPU_TARGETS) foreach(gpu IN LISTS GPU_TARGETS)
...@@ -12,3 +13,4 @@ foreach(gpu IN LISTS GPU_TARGETS) ...@@ -12,3 +13,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
set(target 1) set(target 1)
endif() endif()
endforeach() endforeach()
endif()
...@@ -108,7 +108,7 @@ TEST_F(TestGGemmSplitKInterface_MKNKMN, KLoops) ...@@ -108,7 +108,7 @@ TEST_F(TestGGemmSplitKInterface_MKNKMN, KLoops)
// kloops % 2 // kloops % 2
Ks = std::vector<int>{256, 512, 320, 768}; Ks = std::vector<int>{256, 512, 320, 768};
EXPECT_FALSE( EXPECT_TRUE(
DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch)); DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));
// Not all gemms have same value for main_k0_block_loop! // Not all gemms have same value for main_k0_block_loop!
......
...@@ -147,14 +147,14 @@ struct DeviceGroupedGemmSplitkInstanceWrapper ...@@ -147,14 +147,14 @@ struct DeviceGroupedGemmSplitkInstanceWrapper
32, 32,
4, 4,
2, 2,
S<1, 4, 32, 1>, S<1, 4, 16, 1>,
ABlockTransferThreadClusterArrageOrder, ABlockTransferThreadClusterArrageOrder,
ABlockTransferSrcAccessOrder, ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim::value, ABlockTransferSrcVectorDim::value,
ABlockTransferSrcScalarPerVector, ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1::value, ABlockTransferDstScalarPerVector_K1::value,
ABlockLdsAddExtraM::value, ABlockLdsAddExtraM::value,
S<1, 4, 32, 1>, S<1, 4, 16, 1>,
BBlockTransferThreadClusterArrageOrder, BBlockTransferThreadClusterArrageOrder,
BBlockTransferSrcAccessOrder, BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim::value, BBlockTransferSrcVectorDim::value,
......
add_gtest_executable(test_image_to_column test_image_to_column.cpp)
target_link_libraries(test_image_to_column PRIVATE utility device_image_to_column_instance)
add_gtest_executable(test_image_to_column_interface test_image_to_column_interface.cpp)
target_link_libraries(test_image_to_column_interface PRIVATE utility)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment