Commit 930b2872 authored by Harisankar Sadasivan's avatar Harisankar Sadasivan
Browse files

best performing kernel for GEMV codex problem with M=1 with inverted B matrix

parents a1e17d18 a4f72a31
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_grouped_conv_fwd_impl.hpp"
// Typed-test fixture for grouped N-dimensional forward convolution.
// The Tuple carries {DataType, InLayout, WeiLayout, OutLayout}; each concrete
// test fills `conv_params` and then invokes Run<NDimSpatial>().
template <typename Tuple>
class TestGroupedConvndFwd : public ::testing::Test
{
protected:
    using DataType  = std::tuple_element_t<0, Tuple>;
    using InLayout  = std::tuple_element_t<1, Tuple>;
    using WeiLayout = std::tuple_element_t<2, Tuple>;
    using OutLayout = std::tuple_element_t<3, Tuple>;

    // Convolution problem descriptors to profile; populated by each TYPED_TEST.
    std::vector<ck::utils::conv::ConvParam> conv_params;

    // Profiles and verifies every parameter set; the test fails if any case fails.
    template <ck::index_t NDimSpatial>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());
        bool pass = true;
        for(const auto& param : conv_params)
        {
            // Evaluate the profiler call unconditionally before folding it into
            // `pass`: the original `pass && profile(...)` short-circuited, so
            // after the first failure all remaining cases were silently skipped.
            const bool ok = ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
                                                                        InLayout,
                                                                        WeiLayout,
                                                                        OutLayout,
                                                                        DataType,
                                                                        DataType,
                                                                        DataType>(
                true,  // do_verification
                1,     // init_method: integer value
                false, // do_log
                false, // time_kernel
                param);
            pass = pass && ok;
        }
        EXPECT_TRUE(pass);
    }
};
// Layout tags (GNWC, GKXC, NHWGC, ...) are declared in this namespace.
using namespace ck::tensor_layout::convolution;
// Typed-test parameter lists; each tuple is {DataType, InLayout, WeiLayout, OutLayout}.
// 1D cases: grouped-batch layouts only.
using KernelTypes1d = ::testing::Types<std::tuple<float, GNWC, GKXC, GNWK>,
std::tuple<ck::half_t, GNWC, GKXC, GNWK>,
std::tuple<ck::bhalf_t, GNWC, GKXC, GNWK>,
std::tuple<int8_t, GNWC, GKXC, GNWK>>;
// 2D cases: both GNHWC/GNHWK and NHWGC/NHWGK input/output layout families.
using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWC, GKYXC, GNHWK>,
std::tuple<ck::half_t, GNHWC, GKYXC, GNHWK>,
std::tuple<ck::bhalf_t, GNHWC, GKYXC, GNHWK>,
std::tuple<int8_t, GNHWC, GKYXC, GNHWK>,
std::tuple<float, NHWGC, GKYXC, NHWGK>,
std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>,
std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>,
std::tuple<int8_t, NHWGC, GKYXC, NHWGK>>;
// 3D cases: both GNDHWC/GNDHWK and NDHWGC/NDHWGK input/output layout families.
using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWC, GKZYXC, GNDHWK>,
std::tuple<ck::half_t, GNDHWC, GKZYXC, GNDHWK>,
std::tuple<ck::bhalf_t, GNDHWC, GKZYXC, GNDHWK>,
std::tuple<int8_t, GNDHWC, GKZYXC, GNDHWK>,
std::tuple<float, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<int8_t, NDHWGC, GKZYXC, NDHWGK>>;
// Thin per-rank fixtures so 1D/2D/3D suites can be bound to their own type lists.
template <typename Tuple>
class TestGroupedConvndFwd1d : public TestGroupedConvndFwd<Tuple>
{
};
template <typename Tuple>
class TestGroupedConvndFwd2d : public TestGroupedConvndFwd<Tuple>
{
};
template <typename Tuple>
class TestGroupedConvndFwd3d : public TestGroupedConvndFwd<Tuple>
{
};
// Register each fixture with its matching data-type/layout combinations.
TYPED_TEST_SUITE(TestGroupedConvndFwd1d, KernelTypes1d);
TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d);
TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d);
TYPED_TEST(TestGroupedConvndFwd1d, Test1D)
{
    // Replace any previous contents with the 1D problem set, then profile it.
    // NOTE(review): field order presumably follows ConvParam's constructor
    // (spatial rank first, matching Run<1>) — confirm against its declaration.
    this->conv_params = {{1, 2, 32, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}},
                         {1, 2, 32, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}},
                         {1, 2, 32, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}},
                         {1, 1, 1, 1, 32, {3}, {32}, {1}, {1}, {1}, {1}},
                         {1, 1, 1, 64, 3, {3}, {32}, {1}, {1}, {1}, {1}}};
    this->template Run<1>();
}
TYPED_TEST(TestGroupedConvndFwd2d, Test2D)
{
    // Replace any previous contents with the 2D problem set, then profile it.
    // NOTE(review): field order presumably follows ConvParam's constructor
    // (spatial rank first, matching Run<2>) — confirm against its declaration.
    this->conv_params = {
        {2, 2, 32, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}},
        {2, 2, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}},
        {2, 2, 32, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}},
        {2, 1, 1, 1, 32, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}},
        {2, 1, 1, 64, 3, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}},
        {2, 1, 1, 1, 1, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}};
    this->template Run<2>();
}
TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
{
    // Replace any previous contents with the 3D problem set, then profile it.
    // NOTE(review): field order presumably follows ConvParam's constructor
    // (spatial rank first, matching Run<3>) — confirm against its declaration.
    this->conv_params = {
        {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
        {3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}},
        {3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
        {3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
        {3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}};
    this->template Run<3>();
}
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
...@@ -13,4 +12,3 @@ foreach(gpu IN LISTS GPU_TARGETS) ...@@ -13,4 +12,3 @@ foreach(gpu IN LISTS GPU_TARGETS)
set(target 1)
endif()
endforeach()
endif()
...@@ -4,7 +4,7 @@ TEST_P(RRR_F16_F16_F16, TinyCases) ...@@ -4,7 +4,7 @@ TEST_P(RRR_F16_F16_F16, TinyCases)
{ {
const std::vector<int> Ms{0, 1}; const std::vector<int> Ms{0, 1};
constexpr int N = 768; constexpr int N = 768;
constexpr int K = 544; constexpr int K = 1088;
const std::vector<int> Ns(Ms.size(), N); const std::vector<int> Ns(Ms.size(), N);
const std::vector<int> Ks(Ms.size(), K); const std::vector<int> Ks(Ms.size(), K);
...@@ -17,9 +17,9 @@ TEST_P(RRR_F16_F16_F16, TinyCases) ...@@ -17,9 +17,9 @@ TEST_P(RRR_F16_F16_F16, TinyCases)
TEST_P(RRR_F16_F16_F16, SmallCases) TEST_P(RRR_F16_F16_F16, SmallCases)
{ {
const std::vector<int> Ms{2, 1, 3, 4, 5, 0}; const std::vector<int> Ms{2, 3, 4, 5};
constexpr int N = 768; constexpr int N = 768;
constexpr int K = 544; constexpr int K = 1088;
const std::vector<int> Ns(Ms.size(), N); const std::vector<int> Ns(Ms.size(), N);
const std::vector<int> Ks(Ms.size(), K); const std::vector<int> Ks(Ms.size(), K);
...@@ -34,7 +34,7 @@ TEST_P(RRR_F16_F16_F16, MidCases) ...@@ -34,7 +34,7 @@ TEST_P(RRR_F16_F16_F16, MidCases)
{ {
const std::vector<int> Ms{167, 183, 177, 153, 139, 204}; const std::vector<int> Ms{167, 183, 177, 153, 139, 204};
constexpr int N = 768; constexpr int N = 768;
constexpr int K = 544; constexpr int K = 1088;
const std::vector<int> Ns(Ms.size(), N); const std::vector<int> Ns(Ms.size(), N);
const std::vector<int> Ks(Ms.size(), K); const std::vector<int> Ks(Ms.size(), K);
...@@ -49,7 +49,7 @@ TEST_P(RRR_F16_F16_F16, Regular) ...@@ -49,7 +49,7 @@ TEST_P(RRR_F16_F16_F16, Regular)
{ {
const std::vector<int> Ms{64, 128, 256}; const std::vector<int> Ms{64, 128, 256};
constexpr int N = 768; constexpr int N = 768;
constexpr int K = 320; constexpr int K = 640;
const std::vector<int> Ns(Ms.size(), N); const std::vector<int> Ns(Ms.size(), N);
const std::vector<int> Ks(Ms.size(), K); const std::vector<int> Ks(Ms.size(), K);
...@@ -79,7 +79,7 @@ TEST_P(RCR_F16_F16_F16, TinyCases) ...@@ -79,7 +79,7 @@ TEST_P(RCR_F16_F16_F16, TinyCases)
{ {
const std::vector<int> Ms{0, 1}; const std::vector<int> Ms{0, 1};
constexpr int N = 768; constexpr int N = 768;
constexpr int K = 544; constexpr int K = 1088;
const std::vector<int> Ns(Ms.size(), N); const std::vector<int> Ns(Ms.size(), N);
const std::vector<int> Ks(Ms.size(), K); const std::vector<int> Ks(Ms.size(), K);
...@@ -91,9 +91,9 @@ TEST_P(RCR_F16_F16_F16, TinyCases) ...@@ -91,9 +91,9 @@ TEST_P(RCR_F16_F16_F16, TinyCases)
TEST_P(RCR_F16_F16_F16, SmallCases) TEST_P(RCR_F16_F16_F16, SmallCases)
{ {
const std::vector<int> Ms{2, 1, 3, 4, 5, 0}; const std::vector<int> Ms{2, 3, 4, 5};
constexpr int N = 768; constexpr int N = 768;
constexpr int K = 544; constexpr int K = 1088;
const std::vector<int> Ns(Ms.size(), N); const std::vector<int> Ns(Ms.size(), N);
const std::vector<int> Ks(Ms.size(), K); const std::vector<int> Ks(Ms.size(), K);
...@@ -123,7 +123,7 @@ TEST_P(RCR_F16_F16_F16, Regular) ...@@ -123,7 +123,7 @@ TEST_P(RCR_F16_F16_F16, Regular)
{ {
const std::vector<int> Ms{32, 64, 128, 256}; const std::vector<int> Ms{32, 64, 128, 256};
constexpr int N = 768; constexpr int N = 768;
constexpr int K = 320; constexpr int K = 640;
const std::vector<int> Ns(Ms.size(), N); const std::vector<int> Ns(Ms.size(), N);
const std::vector<int> Ks(Ms.size(), K); const std::vector<int> Ks(Ms.size(), K);
...@@ -151,9 +151,9 @@ TEST_P(RCR_F16_F16_F16, MNKPadded) ...@@ -151,9 +151,9 @@ TEST_P(RCR_F16_F16_F16, MNKPadded)
TEST_P(RRR_F16_F16_F16_LargeK, TestLargeKBatch) TEST_P(RRR_F16_F16_F16_LargeK, TestLargeKBatch)
{ {
const std::vector<int> Ms{188, 210}; const std::vector<int> Ms{127, 150, 188, 210};
constexpr int N = 768; constexpr int N = 768;
constexpr int K = 4096; constexpr int K = 8192;
const std::vector<int> Ns(Ms.size(), N); const std::vector<int> Ns(Ms.size(), N);
const std::vector<int> Ks(Ms.size(), K); const std::vector<int> Ks(Ms.size(), K);
......
if(DTYPES MATCHES "fp16" OR DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) add_custom_target(test_normalization)
add_custom_target(test_normalization) add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp)
endif() if(result EQUAL 0)
if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp)
add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
target_link_libraries(test_layernorm2d_fp32 PRIVATE utility device_normalization_instance) target_link_libraries(test_layernorm2d_fp32 PRIVATE utility device_normalization_instance)
target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance)
add_dependencies(test_normalization test_layernorm2d_fp32) add_dependencies(test_normalization test_layernorm2d_fp32)
endif()
add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
if(result EQUAL 0)
target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance)
add_dependencies(test_normalization test_groupnorm_fp32) add_dependencies(test_normalization test_groupnorm_fp32)
endif() endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp)
add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp) if(result EQUAL 0)
add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp)
target_link_libraries(test_layernorm2d_fp16 PRIVATE utility device_normalization_instance) target_link_libraries(test_layernorm2d_fp16 PRIVATE utility device_normalization_instance)
target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance)
add_dependencies(test_normalization test_layernorm2d_fp16) add_dependencies(test_normalization test_layernorm2d_fp16)
endif()
add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp)
if(result EQUAL 0)
target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance)
add_dependencies(test_normalization test_groupnorm_fp16) add_dependencies(test_normalization test_groupnorm_fp16)
endif() endif()
add_test_executable(test_reduce_no_index reduce_no_index.cpp)
add_test_executable(test_reduce_with_index reduce_with_index.cpp)
target_link_libraries(test_reduce_no_index PRIVATE utility device_reduce_instance)
target_link_libraries(test_reduce_with_index PRIVATE utility device_reduce_instance)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment