Commit f64b1375 authored by coderfeli's avatar coderfeli
Browse files

merge haocong branch

parents 88412f9e f18cfec4
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iostream> #include <iostream>
#include <numeric> #include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_universal_impl.hpp" #include "profiler/profile_gemm_universal_impl.hpp"
#include "profiler_operation_registry.hpp" #include "profiler_operation_registry.hpp"
...@@ -27,8 +27,6 @@ enum struct GemmDataType ...@@ -27,8 +27,6 @@ enum struct GemmDataType
F16_F8_F16, // 5 F16_F8_F16, // 5
F16_F16_F16_F8, // 6 F16_F16_F16_F8, // 6
F8_F8_BF16, // 7 F8_F8_BF16, // 7
F16_I4_F16, // 8
BF16_I4_BF16, // 9
}; };
#define OP_NAME "gemm_universal" #define OP_NAME "gemm_universal"
...@@ -41,7 +39,7 @@ int profile_gemm_universal(int argc, char* argv[]) ...@@ -41,7 +39,7 @@ int profile_gemm_universal(int argc, char* argv[])
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: " printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
"f16->f8; 7: f8->bf16, " "f16->f8; 7: f8->bf16, "
"comp f8; 8: f16@i4; 9: bf16@i4\n"); "comp f8)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
...@@ -105,7 +103,6 @@ int profile_gemm_universal(int argc, char* argv[]) ...@@ -105,7 +103,6 @@ int profile_gemm_universal(int argc, char* argv[])
using BF16 = ck::bhalf_t; using BF16 = ck::bhalf_t;
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) #if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
using F8 = ck::f8_t; using F8 = ck::f8_t;
using I4 = ck::pk_i4_t;
#endif #endif
using Row = ck::tensor_layout::gemm::RowMajor; using Row = ck::tensor_layout::gemm::RowMajor;
...@@ -210,14 +207,6 @@ int profile_gemm_universal(int argc, char* argv[]) ...@@ -210,14 +207,6 @@ int profile_gemm_universal(int argc, char* argv[])
{ {
return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{}); return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
} }
else if(data_type == GemmDataType::F16_I4_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(F16{}, I4{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::BF16_I4_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(BF16{}, I4{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
}
#endif #endif
else else
{ {
......
...@@ -83,9 +83,8 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) ...@@ -83,9 +83,8 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
rotating = std::stoull(argv[18]) * 1024 * 1024; rotating = std::stoull(argv[18]) * 1024 * 1024;
} }
using F32 = float; using F32 = float;
using F16 = ck::half_t; using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) #if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
using F8 = ck::f8_t; using F8 = ck::f8_t;
...@@ -166,22 +165,6 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) ...@@ -166,22 +165,6 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
} }
#endif #endif
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
}
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN)
{
return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{});
}
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_NK_MN)
{
return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Col{}, Row{});
}
else else
{ {
std::cout << "this data_type & layout is not implemented" << std::endl; std::cout << "this data_type & layout is not implemented" << std::endl;
......
...@@ -17,11 +17,11 @@ enum struct GemmMatrixLayout ...@@ -17,11 +17,11 @@ enum struct GemmMatrixLayout
enum struct GemmDataType enum struct GemmDataType
{ {
BF16_I8_BF16, // 0 BF16_I8_BF16, // 0
F16_F16_F16, // 1 F16_F16_F16, // 1
F16_F8_F16, // 2 F16_F8_F16, // 2
F16_I8_F16, // 3 F16_I8_F16, // 3
BF16_BF16_BF16 // 4
}; };
#define OP_NAME "grouped_gemm_fixed_nk" #define OP_NAME "grouped_gemm_fixed_nk"
...@@ -39,6 +39,7 @@ std::vector<int> argToIntArray(char* input) ...@@ -39,6 +39,7 @@ std::vector<int> argToIntArray(char* input)
{ {
out.push_back(std::stoi(item)); out.push_back(std::stoi(item));
} }
return out; return out;
} }
...@@ -82,6 +83,14 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) ...@@ -82,6 +83,14 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
const auto StrideCs = argToIntArray(argv[13]); const auto StrideCs = argToIntArray(argv[13]);
const int kbatch = argc >= 15 ? std::stoi(argv[14]) : 1; const int kbatch = argc >= 15 ? std::stoi(argv[14]) : 1;
using F32 = float;
using F16 = ck::half_t;
#if defined(CK_ENABLE_FP8)
using F8 = ck::f8_t;
#endif
using BF16 = ck::bhalf_t;
using I8 = int8_t;
int n_warmup = 1; int n_warmup = 1;
int n_iter = 10; int n_iter = 10;
if(argc == 17) if(argc == 17)
...@@ -90,59 +99,13 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) ...@@ -90,59 +99,13 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
n_iter = std::stoi(argv[16]); n_iter = std::stoi(argv[16]);
} }
if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) #if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8)
{ if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
#if defined(CK_ENABLE_FP8)
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::half_t, ck::profiler::profile_grouped_gemm_fixed_nk_impl<BF16,
ck::f8_t, I8,
ck::half_t, BF16,
float, F32,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>( ck::tensor_layout::gemm::RowMajor>(
...@@ -160,12 +123,12 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) ...@@ -160,12 +123,12 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
n_warmup, n_warmup,
n_iter); n_iter);
} }
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::half_t, ck::profiler::profile_grouped_gemm_fixed_nk_impl<BF16,
ck::f8_t, I8,
ck::half_t, BF16,
float, F32,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>( ck::tensor_layout::gemm::RowMajor>(
...@@ -183,14 +146,14 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) ...@@ -183,14 +146,14 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
n_warmup, n_warmup,
n_iter); n_iter);
} }
#endif // CK_ENABLE_FP8 #endif
#if defined(CK_ENABLE_INT8) #if defined(CK_ENABLE_FP16)
else if(data_type == GemmDataType::F16_I8_F16 && layout == GemmMatrixLayout::MK_KN_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::half_t, ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
int8_t, F16,
ck::half_t, F16,
float, F32,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>( ck::tensor_layout::gemm::RowMajor>(
...@@ -208,12 +171,12 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) ...@@ -208,12 +171,12 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
n_warmup, n_warmup,
n_iter); n_iter);
} }
else if(data_type == GemmDataType::F16_I8_F16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::half_t, ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
int8_t, F16,
ck::half_t, F16,
float, F32,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>( ck::tensor_layout::gemm::RowMajor>(
...@@ -231,14 +194,14 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) ...@@ -231,14 +194,14 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
n_warmup, n_warmup,
n_iter); n_iter);
} }
#endif // CK_ENABLE_INT8 #endif
#if defined(CK_ENABLE_BF16) #if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8)
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN) else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::bhalf_t, ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
ck::bhalf_t, F8,
ck::bhalf_t, F16,
float, F32,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>( ck::tensor_layout::gemm::RowMajor>(
...@@ -256,12 +219,12 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) ...@@ -256,12 +219,12 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
n_warmup, n_warmup,
n_iter); n_iter);
} }
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::bhalf_t, ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
ck::bhalf_t, F8,
ck::bhalf_t, F16,
float, F32,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>( ck::tensor_layout::gemm::RowMajor>(
...@@ -279,13 +242,14 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) ...@@ -279,13 +242,14 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
n_warmup, n_warmup,
n_iter); n_iter);
} }
#if defined(CK_ENABLE_INT8) #endif
else if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_KN_MN) #if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_INT8)
else if(data_type == GemmDataType::F16_I8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::bhalf_t, ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
int8_t, I8,
ck::bhalf_t, F16,
float, F32,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>( ck::tensor_layout::gemm::RowMajor>(
...@@ -303,12 +267,12 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) ...@@ -303,12 +267,12 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
n_warmup, n_warmup,
n_iter); n_iter);
} }
else if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F16_I8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::bhalf_t, ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
int8_t, I8,
ck::bhalf_t, F16,
float, F32,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>( ck::tensor_layout::gemm::RowMajor>(
...@@ -322,12 +286,11 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[]) ...@@ -322,12 +286,11 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
StrideAs, StrideAs,
StrideBs, StrideBs,
StrideCs, StrideCs,
kbatch, 1,
n_warmup, n_warmup,
n_iter); n_iter);
} }
#endif // CK_ENABLE_INT8 #endif
#endif // CK_ENABLE_BF16
else else
{ {
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented"); throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
......
...@@ -17,7 +17,7 @@ fi ...@@ -17,7 +17,7 @@ fi
cmake \ cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm/ \ -D CMAKE_PREFIX_PATH=/opt/rocm/ \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \ -D CMAKE_CXX_FLAGS="-std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=ON \ -D BUILD_DEV=ON \
-D GPU_TARGETS=$GPU_TARGETS \ -D GPU_TARGETS=$GPU_TARGETS \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment