Commit 1dbdab56 authored by Jing Zhang

merge develop

parents d2e49b23 bac7df8f
@@ -36,7 +36,6 @@ namespace profiler {
 enum struct NormType
 {
-    LAYERNORM,
     BATCHNORM,
     SOFTMAX,
 };
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include "profiler/include/profile_layernorm_impl.hpp"
using ck::index_t;
struct LayernormArgParser
{
std::unordered_map<std::string, std::vector<int>> long_opts = {
{"length", {}}, {"strideXY", {}}, {"strideGamma", {}}, {"strideBeta", {}}};
bool parse_opt(int argc, char* argv[], const std::string& key, int i)
{
if(std::string("--") + key == argv[i])
{
int pos = i;
while(++i < argc && argv[i][0] != '-') {}
int end = i;
for(int j = pos + 1; j < end; j++)
{
long_opts[key].push_back(std::stoi(argv[j]));
}
return true;
}
return false;
}
void operator()(int argc, char* argv[])
{
for(auto& kv : long_opts)
{
for(int i = 1; i < argc; i++)
{
if(parse_opt(argc, argv, kv.first, i))
break;
}
}
}
};
void print_help_layernorm()
{
std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
<< "arg2: verification (0: no; 1: yes)\n"
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg4: print tensor value (0: no; 1: yes)\n"
<< "arg5: time kernel (0=n0, 1=yes)\n"
<< "--length: tensor extents (e.g, --length 1024 1024) \n"
<< "--strideXY: tensor strides (e.g, --strideXY 1024 1)\n"
<< "--strideGamma: tensor strides (e.g, --strideGamma 1)\n"
<< "--strideBeta: tensor strides (e.g, --strideBeta 1)\n"
<< std::endl;
}
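// Example invocation (binary name assumed; argv[1] selects the op in the top-level
// dispatcher, so arg1..arg5 above map to argv[2]..argv[6]):
//   ckProfiler layernorm 1 1 2 0 1 --length 1024 1024 --strideXY 1024 1
//              --strideGamma 1 --strideBeta 1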
int profile_layernorm(int argc, char* argv[])
{
// need argv[1] (op name) plus the five positional options read below
if(argc < 7)
{
print_help_layernorm();
return 0;
}
LayernormArgParser arg_parser;
// positional options (argv[1] holds the op name, so argN maps to argv[N+1])
const ck::DataTypeEnum data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const bool time_kernel = std::stoi(argv[6]);
// parse the long options
arg_parser(argc, argv);
const std::vector<index_t> length = arg_parser.long_opts["length"];
const std::vector<index_t> strideXY = arg_parser.long_opts["strideXY"];
const std::vector<index_t> strideGamma = arg_parser.long_opts["strideGamma"];
const std::vector<index_t> strideBeta = arg_parser.long_opts["strideBeta"];
using F16 = ck::half_t;
using F32 = float;
constexpr int rank = 2;
if(data_type == ck::DataTypeEnum::Half)
{
ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, rank>(do_verification,
init_method,
do_log,
time_kernel,
length,
strideXY,
strideGamma,
strideBeta);
}
else if(data_type == ck::DataTypeEnum::Float)
{
ck::profiler::profile_layernorm_impl<F32, F32, F32, F32, F32, rank>(do_verification,
init_method,
do_log,
time_kernel,
length,
strideXY,
strideGamma,
strideBeta);
}
else
{
throw std::runtime_error("profile_layernorm: data type not implemented yet");
}
return 0;
}
// hijack main() for quick debugging
// int main(int argc, char* argv[])
// {
// profile_layernorm(argc, argv);
// return 0;
// }
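The commented-out main() above can be fleshed out for standalone debugging; a minimal
sketch (the "ckProfiler" name and argument values are illustrative, not part of the commit):

#include <vector>
int profile_layernorm(int, char*[]);
int main()
{
    // fp32, verify on, decimal init, no tensor dump, time the kernel,
    // then a 1024x1024 row-major problem
    std::vector<const char*> args = {"ckProfiler", "layernorm", "1", "1", "2", "0", "1",
                                     "--length", "1024", "1024",
                                     "--strideXY", "1024", "1",
                                     "--strideGamma", "1",
                                     "--strideBeta", "1"};
    return profile_layernorm(static_cast<int>(args.size()),
                             const_cast<char**>(args.data()));
}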
@@ -13,8 +13,7 @@ using ck::profiler::NormType;
 struct ArgParser
 {
-    std::unordered_map<std::string, NormType> norm_dict = {{"layernorm", NormType::LAYERNORM},
-                                                           {"batchnorm", NormType::BATCHNORM},
+    std::unordered_map<std::string, NormType> norm_dict = {{"batchnorm", NormType::BATCHNORM},
                                                            {"softmax", NormType::SOFTMAX}};
     std::unordered_map<std::string, std::vector<int>> long_opts = {
......
@@ -19,6 +19,7 @@ int profile_conv_bwd_data(int, char*[]);
 int profile_conv_bwd_weight(int, char*[]);
 int profile_grouped_conv_fwd(int, char*[]);
 int profile_normalization(int, char*[]);
+int profile_layernorm(int, char*[]);
 int profile_reduce(int, char*[]);
 static void print_helper_message()
@@ -115,11 +116,14 @@ int main(int argc, char* argv[])
     {
         return profile_reduce(argc, argv);
     }
-    else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "layernorm") == 0 ||
-            strcmp(argv[1], "softmax") == 0)
+    else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "softmax") == 0)
     {
         return profile_normalization(argc, argv);
     }
+    else if(strcmp(argv[1], "layernorm") == 0)
+    {
+        return profile_layernorm(argc, argv);
+    }
     else
     {
         print_helper_message();
......
@@ -40,6 +40,8 @@ add_subdirectory(gemm_split_k)
 add_subdirectory(gemm_reduce)
 add_subdirectory(batched_gemm)
 add_subdirectory(batched_gemm_reduce)
+add_subdirectory(batched_gemm_gemm)
+add_subdirectory(batched_gemm_softmax_gemm)
 add_subdirectory(grouped_gemm)
 add_subdirectory(reduce)
 add_subdirectory(convnd_fwd)
......
add_custom_target(test_batched_gemm_gemm)
add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
\ No newline at end of file
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "test_batched_gemm_gemm_util.hpp"
template <typename Tuple>
class TestBatchedGemmGemmFP16 : public TestBatchedGemmGemm<Tuple>
{
};
// clang-format off
using KernelTypes = ::testing::Types<
std::tuple<F16, F16, F16, F16, Row, Col, Row, Row>
>;
// clang-format on
TYPED_TEST_SUITE(TestBatchedGemmGemmFP16, KernelTypes);
TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16) { this->Run(); }
TYPED_TEST(TestBatchedGemmGemmFP16, DISABLED_Bench_FP16)
{
this->lengths_ = std::vector<std::vector<int>>{
{256, 256, 64, 64, 768},
{256, 256, 128, 128, 768},
{512, 512, 64, 64, 768},
{512, 512, 128, 128, 768},
{1024, 1024, 64, 64, 768},
{1024, 1024, 128, 128, 768},
{2048, 2048, 64, 64, 768},
{2048, 2048, 128, 128, 768},
{4096, 4096, 64, 64, 768},
{4096, 4096, 128, 128, 768},
};
this->bench_ = true;
this->verify_ = false;
this->Run();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include "profiler/include/profile_batched_gemm_gemm_impl.hpp"
template <ck::index_t N>
using I = ck::Number<N>;
using F16 = ck::half_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <typename Tuple>
struct TestBatchedGemmGemm : public ::testing::Test
{
using ADataType = std::tuple_element_t<0, Tuple>;
using B0DataType = std::tuple_element_t<1, Tuple>;
using B1DataType = std::tuple_element_t<2, Tuple>;
using CDataType = std::tuple_element_t<3, Tuple>;
using ALayout = std::tuple_element_t<4, Tuple>;
using B0Layout = std::tuple_element_t<5, Tuple>;
using B1Layout = std::tuple_element_t<6, Tuple>;
using CLayout = std::tuple_element_t<7, Tuple>;
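// each row of lengths_ is {M, N, K, O, BatchCount} for C = (A * B0) * B1,
// unpacked in Run() below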
std::vector<std::vector<int>> lengths_ = {
{256, 256, 64, 64, 4},
{256, 256, 128, 128, 4},
{512, 512, 64, 64, 2},
{512, 512, 128, 128, 2},
{1024, 1024, 64, 64, 1},
{1024, 1024, 128, 128, 1},
};
bool bench_ = false;
bool verify_ = true;
void RunSingle(int M, int N, int K, int O, int BatchCount)
{
bool pass = ck::profiler::profile_batched_gemm_gemm_impl<ADataType,
B0DataType,
B1DataType,
CDataType,
ALayout,
B0Layout,
B1Layout,
CLayout>(
verify_, 1, false, bench_, M, N, K, O, BatchCount);
EXPECT_TRUE(pass);
}
void Run()
{
for(auto lengths : this->lengths_)
{
int M = lengths[0];
int N = lengths[1];
int K = lengths[2];
int O = lengths[3];
int BatchCount = lengths[4];
this->RunSingle(M, N, K, O, BatchCount);
}
}
};
add_custom_target(test_batched_gemm_softmax_gemm)
add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
\ No newline at end of file
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "test_batched_gemm_softmax_gemm_util.hpp"
template <typename Tuple>
class TestBatchedGemmSoftmaxGemmFP16 : public TestBatchedGemmSoftmaxGemm<Tuple>
{
};
// clang-format off
using KernelTypes = ::testing::Types<
std::tuple<F16, F16, F16, F16, Row, Col, Row, Row>
>;
// clang-format on
TYPED_TEST_SUITE(TestBatchedGemmSoftmaxGemmFP16, KernelTypes);
TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, Test_FP16) { this->Run(); }
TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, DISABLED_Bench_FP16)
{
this->lengths_ = std::vector<std::vector<int>>{
{256, 256, 64, 64, 768},
{256, 256, 128, 128, 768},
{512, 512, 64, 64, 768},
{512, 512, 128, 128, 768},
{1024, 1024, 64, 64, 768},
{1024, 1024, 128, 128, 768},
{2048, 2048, 64, 64, 768},
{2048, 2048, 128, 128, 768},
{4096, 4096, 64, 64, 768},
{4096, 4096, 128, 128, 768},
};
this->bench_ = true;
this->verify_ = false;
this->Run();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include "profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp"
template <ck::index_t N>
using I = ck::Number<N>;
using F16 = ck::half_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <typename Tuple>
struct TestBatchedGemmSoftmaxGemm : public ::testing::Test
{
using ADataType = std::tuple_element_t<0, Tuple>;
using B0DataType = std::tuple_element_t<1, Tuple>;
using B1DataType = std::tuple_element_t<2, Tuple>;
using CDataType = std::tuple_element_t<3, Tuple>;
using ALayout = std::tuple_element_t<4, Tuple>;
using B0Layout = std::tuple_element_t<5, Tuple>;
using B1Layout = std::tuple_element_t<6, Tuple>;
using CLayout = std::tuple_element_t<7, Tuple>;
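// each row of lengths_ is {M, N, K, O, BatchCount}; softmax is applied to the
// first gemm's M x N output before it is multiplied by B1 (N x O)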
std::vector<std::vector<int>> lengths_ = {
{256, 256, 64, 64, 4},
{256, 256, 128, 128, 4},
{512, 512, 64, 64, 2},
{512, 512, 128, 128, 2},
{1024, 1024, 64, 64, 1},
{1024, 1024, 128, 128, 1},
};
bool bench_ = false;
bool verify_ = true;
void RunSingle(int M, int N, int K, int O, int BatchCount)
{
bool pass = ck::profiler::profile_batched_gemm_softmax_gemm_impl<ADataType,
B0DataType,
B1DataType,
CDataType,
ALayout,
B0Layout,
B1Layout,
CLayout>(
verify_, 1, false, bench_, M, N, K, O, BatchCount);
EXPECT_TRUE(pass);
}
void Run()
{
for(auto lengths : this->lengths_)
{
int M = lengths[0];
int N = lengths[1];
int K = lengths[2];
int O = lengths[3];
int BatchCount = lengths[4];
this->RunSingle(M, N, K, O, BatchCount);
}
}
};
@@ -9,7 +9,7 @@
 #include "ck/ck.hpp"
 #include "ck/utility/number.hpp"
-#include "ck/tensor_operation/gpu/device/device_layernorm.hpp"
+#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -63,24 +63,24 @@ class TestLayernorm : public ::testing::Test
                        Rank,
                        NumReduceDim>;
-    using DeviceInstance = tensor_operation::device::DeviceLayernorm<XDataType,
-                                                                     GammaDataType,
-                                                                     BetaDataType,
-                                                                     AccDataType,
-                                                                     YDataType,
-                                                                     PassThrough,
-                                                                     Rank,
-                                                                     NumReduceDim,
-                                                                     BlockSize,
-                                                                     MThreadClusterSize,
-                                                                     KThreadClusterSize,
-                                                                     MThreadSliceSize,
-                                                                     KThreadSliceSize,
-                                                                     XYSrcVectorDim,
-                                                                     XSrcVectorSize,
-                                                                     GammaSrcVectorSize,
-                                                                     BetaSrcVectorSize,
-                                                                     YDstVectorSize>;
+    using DeviceInstance = tensor_operation::device::DeviceLayernormImpl<XDataType,
+                                                                         GammaDataType,
+                                                                         BetaDataType,
+                                                                         AccDataType,
+                                                                         YDataType,
+                                                                         PassThrough,
+                                                                         Rank,
+                                                                         NumReduceDim,
+                                                                         BlockSize,
+                                                                         MThreadClusterSize,
+                                                                         KThreadClusterSize,
+                                                                         MThreadSliceSize,
+                                                                         KThreadSliceSize,
+                                                                         XYSrcVectorDim,
+                                                                         XSrcVectorSize,
+                                                                         GammaSrcVectorSize,
+                                                                         BetaSrcVectorSize,
+                                                                         YDstVectorSize>;
 TestLayernorm() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {}
@@ -119,6 +119,7 @@ class TestLayernorm : public ::testing::Test
                                  gamma.mDesc.GetStrides().end()},
         std::vector<ck::index_t>{beta.mDesc.GetStrides().begin(),
                                  beta.mDesc.GetStrides().end()},
+        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
         reduceDims,
         1e-4,
         x_dev.GetDeviceBuffer(),
......