Absolute include path (#281)

* ad gelu and fast_gelu * added GeLU and fast GeLU * clean up * add gemm+fastgelu example * add gemm+gelu instances * update profiler * clean up * clean up * adding gemm+bias+activation * clean * adding bias * clean * adding gemm multiple d * debugging * add gemm bias add fastgelu * rename, clean * refactoring; add readme * refactor * refactor * refactor * refactor * refactor * refactor * fix * fix * update example * update example * rename * update example * add ckProfiler * clean * clean * clean * clean * add client app example * update readme * delete obselete files * remove old client app * delete old file * cleaning * clean * remove half * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path for all examples * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path * revert client app example * clean build * fix build * temporary disable client test on Jenkins * clean * clean * clean

Absolute include path (#281)
* ad gelu and fast_gelu * added GeLU and fast GeLU * clean up * add gemm+fastgelu example * add gemm+gelu instances * update profiler * clean up * clean up * adding gemm+bias+activation * clean * adding bias * clean * adding gemm multiple d * debugging * add gemm bias add fastgelu * rename, clean * refactoring; add readme * refactor * refactor * refactor * refactor * refactor * refactor * fix * fix * update example * update example * rename * update example * add ckProfiler * clean * clean * clean * clean * add client app example * update readme * delete obselete files * remove old client app * delete old file * cleaning * clean * remove half * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path for all examples * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path * revert client app example * clean build * fix build * temporary disable client test on Jenkins * clean * clean * clean
d1db6a0c · Chao Liu · GitHub · a49115b9 · d1db6a0c · d1db6a0c
Unverified Commit d1db6a0c authored Jun 24, 2022 by Chao Liu Committed by GitHub Jun 24, 2022
20 changed files
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
-#include "device_reduce_instance_blockwise.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -53,7 +53,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
-#include "device_reduce_instance_blockwise.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -40,7 +40,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
-#include "device_reduce_instance_blockwise.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -28,7 +28,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP
-#include "device_reduce_instance_blockwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -51,7 +52,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP
-#include "device_reduce_instance_blockwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -27,7 +28,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP
-#include "device_reduce_instance_blockwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -51,7 +52,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
-#include "device_reduce_instance_blockwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -23,7 +24,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
-#include "device_reduce_instance_blockwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -39,7 +40,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP
 namespace ck {
 namespace tensor_operation {
@@ -35,7 +34,4 @@ struct ReductionConfiguration_2
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP
-#include "reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
-#include "device_reduce_instance_impl_common.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
-#include "device_reduce_multiblock.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -193,7 +193,4 @@ void add_device_reduce_instance_multiblock_atomic_add(
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -24,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -24,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -23,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -23,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -23,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
-#include "reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
-#include "device_reduce_instance_impl_common.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp"
-#include "device_reduce_threadwise.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -152,7 +151,4 @@ void add_device_reduce_instance_threadwise(
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -53,7 +53,4 @@ ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -40,7 +40,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -28,7 +28,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
+#pragma once
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
-#include "device_reduce_instance_threadwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -51,7 +52,4 @@ ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif