"app/git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "42998d797d79c790354625d387b50e692c1e27f0"
Unverified Commit aebd211c authored by Chao Liu's avatar Chao Liu Committed by GitHub
Browse files

External Interface (#304)

* add client example

* clean

* clean

* reorg

* clean up profiler

* reorg

* clea

* fix profiler

* function for getinstances

* update client example

* update client example

* update client example

* update

* update example

* update Jenkins file

* update cmake

* update Jenkins
parent b653c5eb
...@@ -67,9 +67,11 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_in ...@@ -67,9 +67,11 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_in
>; >;
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances( void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances(
std::vector< std::vector<DeviceBatchedGemmReducePtr<PassThrough,
DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>& PassThrough,
instances) PassThrough,
DInElementOps,
DOutElementOps>>& instances)
{ {
add_device_operation_instances( add_device_operation_instances(
instances, instances,
......
...@@ -67,9 +67,11 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_in ...@@ -67,9 +67,11 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_in
>; >;
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances( void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
std::vector< std::vector<DeviceBatchedGemmReducePtr<PassThrough,
DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>& PassThrough,
instances) PassThrough,
DInElementOps,
DOutElementOps>>& instances)
{ {
add_device_operation_instances( add_device_operation_instances(
instances, instances,
......
...@@ -64,9 +64,11 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_in ...@@ -64,9 +64,11 @@ using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_in
>; >;
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances( void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances(
std::vector< std::vector<DeviceBatchedGemmReducePtr<PassThrough,
DeviceGemmReducePtr<PassThrough, PassThrough, PassThrough, DInElementOps, DOutElementOps>>& PassThrough,
instances) PassThrough,
DInElementOps,
DOutElementOps>>& instances)
{ {
add_device_operation_instances( add_device_operation_instances(
instances, instances,
......
...@@ -28,14 +28,6 @@ set(DEVICE_GEMM_INSTANCE_SOURCE ...@@ -28,14 +28,6 @@ set(DEVICE_GEMM_INSTANCE_SOURCE
device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp; device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp;
device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp; device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp;
device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp; device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp;
device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp;
device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp;
device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp;
device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp;
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp;
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp;
device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp;
device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp;
device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp; device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp;
device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp; device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp;
device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp; device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp;
......
set(DEVICE_GEMM_SPLITK_INSTANCE_SOURCE
device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp;
device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp;
device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp;
device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp;
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp;
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp;
device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp;
device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp;
)
add_library(device_gemm_splitk_instance OBJECT ${DEVICE_GEMM_SPLITK_INSTANCE_SOURCE})
target_compile_features(device_gemm_splitk_instance PUBLIC)
set_target_properties(device_gemm_splitk_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
...@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances = std::tuple< ...@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances = std::tuple<
>; >;
void add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances( void add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances) std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances{}); device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances{});
......
...@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances = std::tuple< ...@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances = std::tuple<
>; >;
void add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances( void add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances) std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances{}); device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances{});
......
...@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple< ...@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
>; >;
void add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances( void add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances) std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances{}); device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances{});
......
...@@ -83,7 +83,7 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple< ...@@ -83,7 +83,7 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple<
// >; // >;
void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances( void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances) std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances{}); device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances{});
......
...@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances = std::tuple< ...@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances = std::tuple<
>; >;
void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances( void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances) std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances{}); device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances{});
......
...@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances = std::tuple< ...@@ -46,7 +46,7 @@ using device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances = std::tuple<
>; >;
void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances( void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances) std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances{}); device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances{});
......
...@@ -51,7 +51,7 @@ using device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances = std::tuple< ...@@ -51,7 +51,7 @@ using device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances = std::tuple<
>; >;
void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances( void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances) std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances{}); device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances{});
......
...@@ -51,7 +51,7 @@ using device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances = std::tuple< ...@@ -51,7 +51,7 @@ using device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances = std::tuple<
>; >;
void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances( void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(
std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances) std::vector<DeviceGemmSplitKPtr<PassThrough, PassThrough, PassThrough>>& instances)
{ {
add_device_operation_instances(instances, add_device_operation_instances(instances,
device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances{}); device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances{});
......
...@@ -6,6 +6,7 @@ include_directories(BEFORE ...@@ -6,6 +6,7 @@ include_directories(BEFORE
set(PROFILER_SOURCE set(PROFILER_SOURCE
src/profiler.cpp src/profiler.cpp
src/profile_gemm.cpp src/profile_gemm.cpp
src/profile_gemm_splitk.cpp
src/profile_gemm_bias_2d.cpp src/profile_gemm_bias_2d.cpp
src/profile_gemm_bias_relu.cpp src/profile_gemm_bias_relu.cpp
src/profile_gemm_bias_relu_add.cpp src/profile_gemm_bias_relu_add.cpp
...@@ -27,21 +28,22 @@ add_executable(ckProfiler ${PROFILER_SOURCE}) ...@@ -27,21 +28,22 @@ add_executable(ckProfiler ${PROFILER_SOURCE})
target_link_libraries(ckProfiler PRIVATE host_tensor) target_link_libraries(ckProfiler PRIVATE host_tensor)
target_link_libraries(ckProfiler PRIVATE conv_util) target_link_libraries(ckProfiler PRIVATE conv_util)
target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias_add_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_splitk_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_add_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias_add_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_add_add_fastgelu_instance)
target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance) target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance)
target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance)
target_link_libraries(ckProfiler PRIVATE device_conv1d_fwd_instance) target_link_libraries(ckProfiler PRIVATE device_conv1d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_conv3d_fwd_instance) target_link_libraries(ckProfiler PRIVATE device_conv3d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance) target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance)
target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance) target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_add_add_fastgelu_instance)
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/utility/reduction_operator.hpp" #include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_reduce.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
...@@ -29,7 +29,7 @@ using Square = ck::tensor_operation::element_wise::UnarySquare; ...@@ -29,7 +29,7 @@ using Square = ck::tensor_operation::element_wise::UnarySquare;
using DInElementOps = ck::Tuple<Identity, Square>; using DInElementOps = ck::Tuple<Identity, Square>;
using DOutElementOps = ck::Tuple<Identity, Identity>; using DOutElementOps = ck::Tuple<Identity, Identity>;
using DeviceGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmReducePtr< using DeviceBatchedGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceBatchedGemmReducePtr<
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough,
...@@ -37,16 +37,16 @@ using DeviceGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmReducePt ...@@ -37,16 +37,16 @@ using DeviceGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmReducePt
DOutElementOps>; DOutElementOps>;
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances( void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
std::vector<DeviceGemmReduceNoOpPtr>&); std::vector<DeviceBatchedGemmReduceNoOpPtr>&);
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances( void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances(
std::vector<DeviceGemmReduceNoOpPtr>&); std::vector<DeviceBatchedGemmReduceNoOpPtr>&);
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances( void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances(
std::vector<DeviceGemmReduceNoOpPtr>&); std::vector<DeviceBatchedGemmReduceNoOpPtr>&);
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances( void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances(
std::vector<DeviceGemmReduceNoOpPtr>&); std::vector<DeviceBatchedGemmReduceNoOpPtr>&);
} // namespace device_gemm_instance } // namespace device_gemm_instance
} // namespace device } // namespace device
...@@ -204,7 +204,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -204,7 +204,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
b_device_buf.ToDevice(b_g_k_n.mData.data()); b_device_buf.ToDevice(b_g_k_n.mData.data());
// add device GEMM instances // add device GEMM instances
std::vector<ck::tensor_operation::device::device_gemm_instance::DeviceGemmReduceNoOpPtr> std::vector<ck::tensor_operation::device::device_gemm_instance::DeviceBatchedGemmReduceNoOpPtr>
gemm_ptrs; gemm_ptrs;
if constexpr(is_same<ADataType, half_t>::value && is_same<BDataType, half_t>::value && if constexpr(is_same<ADataType, half_t>::value && is_same<BDataType, half_t>::value &&
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck {
namespace profiler {
int profile_convnd_fwd(int argc, char* argv[]);
} // namespace profiler
} // namespace ck
...@@ -9,6 +9,9 @@ ...@@ -9,6 +9,9 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/device_gemm_add_add_fastgelu_instance.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp" #include "ck/library/host_tensor/host_tensor.hpp"
...@@ -16,31 +19,6 @@ ...@@ -16,31 +19,6 @@
#include "ck/library/host_tensor/host_conv.hpp" #include "ck/library/host_tensor/host_conv.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_gemm_instance {
using DeviceGemmAddAddFastGeluPtr = ck::tensor_operation::device::DeviceGemmMultipleDPtr<
2,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::AddAddFastGelu>;
void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
std::vector<DeviceGemmAddAddFastGeluPtr>&);
void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
std::vector<DeviceGemmAddAddFastGeluPtr>&);
void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(
std::vector<DeviceGemmAddAddFastGeluPtr>&);
void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
std::vector<DeviceGemmAddAddFastGeluPtr>&);
} // namespace device_gemm_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck { namespace ck {
namespace profiler { namespace profiler {
...@@ -55,18 +33,18 @@ template <typename ADataType, ...@@ -55,18 +33,18 @@ template <typename ADataType,
typename D0Layout, typename D0Layout,
typename D1Layout, typename D1Layout,
typename ELayout> typename ELayout>
int profile_gemm_add_add_fastgelu_impl(int do_verification, bool profile_gemm_add_add_fastgelu_impl(int do_verification,
int init_method, int init_method,
bool /*do_log*/, bool /*do_log*/,
bool time_kernel, bool time_kernel,
int M, int M,
int N, int N,
int K, int K,
int StrideA, int StrideA,
int StrideB, int StrideB,
int StrideD0, int StrideD0,
int StrideD1, int StrideD1,
int StrideE) int StrideE)
{ {
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
...@@ -122,48 +100,21 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -122,48 +100,21 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification,
const auto b_element_op = BElementOp{}; const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{}; const auto cde_element_op = CDEElementOp{};
// add device GEMM instances // add device op instances
std::vector<ck::tensor_operation::device::device_gemm_instance::DeviceGemmAddAddFastGeluPtr> const auto op_ptrs = ck::tensor_operation::device::device_gemm_instance::
device_op_ptrs; get_device_gemm_add_add_fastgelu_instances<ADataType,
BDataType,
if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> && AccDataType,
is_same_v<EDataType, half_t>) D0DataType,
{ D1DataType,
if constexpr(is_same_v<ALayout, tensor_layout::gemm::RowMajor> && EDataType,
is_same_v<BLayout, tensor_layout::gemm::RowMajor> && ALayout,
is_same_v<ELayout, tensor_layout::gemm::RowMajor>) BLayout,
{ D0Layout,
ck::tensor_operation::device::device_gemm_instance:: D1Layout,
add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances( ELayout>();
device_op_ptrs);
} std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
else if constexpr(is_same_v<ALayout, tensor_layout::gemm::RowMajor> &&
is_same_v<BLayout, tensor_layout::gemm::ColumnMajor> &&
is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
device_op_ptrs);
}
else if constexpr(is_same_v<ALayout, tensor_layout::gemm::ColumnMajor> &&
is_same_v<BLayout, tensor_layout::gemm::RowMajor> &&
is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(
device_op_ptrs);
}
else if constexpr(is_same_v<ALayout, tensor_layout::gemm::ColumnMajor> &&
is_same_v<BLayout, tensor_layout::gemm::ColumnMajor> &&
is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
device_op_ptrs);
}
}
std::cout << "found " << device_op_ptrs.size() << " instances" << std::endl;
// run reference // run reference
if(do_verification) if(do_verification)
...@@ -207,7 +158,7 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -207,7 +158,7 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification,
d0_m_n_device_buf.ToDevice(d0_m_n.mData.data()); d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
d1_m_n_device_buf.ToDevice(d1_m_n.mData.data()); d1_m_n_device_buf.ToDevice(d1_m_n.mData.data());
std::string best_device_op_name; std::string best_op_name;
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
...@@ -215,14 +166,14 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -215,14 +166,14 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification,
bool pass = true; bool pass = true;
// profile device operation instances // profile device operation instances
for(auto& device_op_ptr : device_op_ptrs) for(auto& op_ptr : op_ptrs)
{ {
auto argument_ptr = device_op_ptr->MakeArgumentPointer( auto argument_ptr = op_ptr->MakeArgumentPointer(
a_device_buf.GetDeviceBuffer(), a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(), b_device_buf.GetDeviceBuffer(),
std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(), std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
d1_m_n_device_buf.GetDeviceBuffer()}, d1_m_n_device_buf.GetDeviceBuffer()},
static_cast<EDataType*>(e_device_buf.GetDeviceBuffer()), e_device_buf.GetDeviceBuffer(),
M, M,
N, N,
K, K,
...@@ -234,11 +185,11 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -234,11 +185,11 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification,
b_element_op, b_element_op,
cde_element_op); cde_element_op);
auto invoker_ptr = device_op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string device_op_name = device_op_ptr->GetTypeString(); std::string op_name = op_ptr->GetTypeString();
if(device_op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
// re-init E to zero before profiling a kernel // re-init E to zero before profiling a kernel
e_device_buf.SetZero(); e_device_buf.SetZero();
...@@ -256,14 +207,14 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -256,14 +207,14 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification,
float gb_per_sec = num_btype / 1.E6 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << device_op_name << std::endl; << gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops) if(tflops > best_tflops)
{ {
best_device_op_name = device_op_name; best_op_name = op_name;
best_tflops = tflops; best_tflops = tflops;
best_ave_time = ave_time; best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec; best_gb_per_sec = gb_per_sec;
} }
if(do_verification) if(do_verification)
...@@ -276,14 +227,14 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -276,14 +227,14 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification,
} }
else else
{ {
std::cout << device_op_name << " does not support this problem" << std::endl; std::cout << op_name << " does not support this problem" << std::endl;
} }
} }
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_device_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return pass ? 0 : 1; return pass;
} }
} // namespace profiler } // namespace profiler
......
This diff is collapsed.
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <iostream>
#include <typeinfo>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/device_gemm_splitk_instance.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace profiler {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename ALayout,
typename BLayout,
typename CLayout>
bool profile_gemm_splitk_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
int M,
int N,
int K,
int StrideA,
int StrideB,
int StrideC,
int KBatch)
{
bool pass = true;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
}
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{};
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
c_device_buf.ToDevice(c_m_n_device_result.mData.data());
// add device op instances
const auto op_ptrs =
ck::tensor_operation::device::device_gemm_instance::get_device_gemm_splitk_instances<
ADataType,
BDataType,
CDataType,
ALayout,
BLayout,
CLayout>();
if(op_ptrs.size() <= 0)
{
throw std::runtime_error("wrong! no device operation instance found");
}
// Run reference GEMM
if(do_verification)
{
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
AccDataType,
AElementOp,
BElementOp,
CElementOp>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
}
std::string best_op_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device GEMM instances
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr =
op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op,
KBatch);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
// re-init C to zero before profiling next kernel
c_device_buf.SetZero();
std::string op_name = op_ptr->GetTypeString();
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass =
pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "c_host : ", c_m_n_host_result.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
<< std::endl;
}
}
}
else
{
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
}
}
if constexpr(is_same<CDataType, float>::value)
{
std::cout << "Best Perf for datatype = f32";
}
else if constexpr(is_same<CDataType, half_t>::value)
{
std::cout << "Best Perf for datatype = f16";
}
else if constexpr(is_same<CDataType, bhalf_t>::value)
{
std::cout << "Best Perf for datatype = bf16";
}
else if constexpr(is_same<CDataType, int8_t>::value)
{
std::cout << "Best Perf for datatype = int8";
}
if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value)
{
std::cout << " ALayout = RowMajor";
}
else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value)
{
std::cout << " ALayout = ColumnMajor";
}
if constexpr(is_same<BLayout, tensor_layout::gemm::RowMajor>::value)
{
std::cout << " BLayout = RowMajor";
}
else if constexpr(is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value)
{
std::cout << " BLayout = ColumnMajor";
}
std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
<< " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_ave_time
<< " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
return pass;
}
} // namespace profiler
} // namespace ck
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment