Unverified Commit d1db6a0c authored by Chao Liu's avatar Chao Liu Committed by GitHub
Browse files

Absolute include path (#281)

* add gelu and fast_gelu

* added GeLU and fast GeLU

* clean up

* add gemm+fastgelu example

* add gemm+gelu instances

* update profiler

* clean up

* clean up

* adding gemm+bias+activation

* clean

* adding bias

* clean

* adding gemm multiple d

* debugging

* add gemm bias add fastgelu

* rename, clean

* refactoring; add readme

* refactor

* refactor

* refactor

* refactor

* refactor

* refactor

* fix

* fix

* update example

* update example

* rename

* update example

* add ckProfiler

* clean

* clean

* clean

* clean

* add client app example

* update readme

* delete obsolete files

* remove old client app

* delete old file

* cleaning

* clean

* remove half

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path for all examples

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path

* revert client app example

* clean build

* fix build

* temporarily disable client test on Jenkins

* clean

* clean

* clean
parent a49115b9
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
#include "data_type.hpp" #include "ck/utility/data_type.hpp"
#include "device_reduce_instance_blockwise.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -53,7 +53,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); ...@@ -53,7 +53,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP
#include "data_type.hpp" #include "ck/utility/data_type.hpp"
#include "device_reduce_instance_blockwise.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -40,7 +40,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); ...@@ -40,7 +40,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP
#include "data_type.hpp" #include "ck/utility/data_type.hpp"
#include "device_reduce_instance_blockwise.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -28,7 +28,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); ...@@ -28,7 +28,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP
#include "device_reduce_instance_blockwise.hpp" #include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -51,7 +52,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); ...@@ -51,7 +52,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP
#include "device_reduce_instance_blockwise.hpp" #include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -27,7 +28,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); ...@@ -27,7 +28,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP
#include "device_reduce_instance_blockwise.hpp" #include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -51,7 +52,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); ...@@ -51,7 +52,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
#include "device_reduce_instance_blockwise.hpp" #include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -23,7 +24,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); ...@@ -23,7 +24,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
#include "device_reduce_instance_blockwise.hpp" #include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -39,7 +40,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); ...@@ -39,7 +40,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -35,7 +34,4 @@ struct ReductionConfiguration_2 ...@@ -35,7 +34,4 @@ struct ReductionConfiguration_2
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP
#include "reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "device_reduce_instance_impl_common.hpp" #include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "device_reduce_multiblock.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -193,7 +193,4 @@ void add_device_reduce_instance_multiblock_atomic_add( ...@@ -193,7 +193,4 @@ void add_device_reduce_instance_multiblock_atomic_add(
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
#include "data_type.hpp" #include "ck/utility/data_type.hpp"
#include "device_reduce_instance_multiblock_atomic_add.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -24,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1); ...@@ -24,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
#include "data_type.hpp" #include "ck/utility/data_type.hpp"
#include "device_reduce_instance_multiblock_atomic_add.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -24,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1); ...@@ -24,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
#include "device_reduce_instance_multiblock_atomic_add.hpp" #include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -23,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); ...@@ -23,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
#include "device_reduce_instance_multiblock_atomic_add.hpp" #include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -23,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); ...@@ -23,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
#include "device_reduce_instance_multiblock_atomic_add.hpp" #include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -23,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); ...@@ -23,7 +24,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
#include "reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "device_reduce_instance_impl_common.hpp" #include "ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp"
#include "device_reduce_threadwise.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -152,7 +151,4 @@ void add_device_reduce_instance_threadwise( ...@@ -152,7 +151,4 @@ void add_device_reduce_instance_threadwise(
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
#include "data_type.hpp" #include "ck/utility/data_type.hpp"
#include "device_reduce_instance_threadwise.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -53,7 +53,4 @@ ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); ...@@ -53,7 +53,4 @@ ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
#include "data_type.hpp" #include "ck/utility/data_type.hpp"
#include "device_reduce_instance_threadwise.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -40,7 +40,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); ...@@ -40,7 +40,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
#include "data_type.hpp" #include "ck/utility/data_type.hpp"
#include "device_reduce_instance_threadwise.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -28,7 +28,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); ...@@ -28,7 +28,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP #pragma once
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
#include "device_reduce_instance_threadwise.hpp" #include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -51,7 +52,4 @@ ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); ...@@ -51,7 +52,4 @@ ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
} // namespace device_reduce_instance } // namespace device_reduce_instance
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment