Unverified commit d1db6a0c, authored by Chao Liu and committed by GitHub

Absolute include path (#281)

* add gelu and fast_gelu

* added GeLU and fast GeLU

* clean up

* add gemm+fastgelu example

* add gemm+gelu instances

* update profiler

* clean up

* clean up

* adding gemm+bias+activation

* clean

* adding bias

* clean

* adding gemm multiple d

* debugging

* add gemm bias add fastgelu

* rename, clean

* refactoring; add readme

* refactor

* refactor

* refactor

* refactor

* refactor

* refactor

* fix

* fix

* update example

* update example

* rename

* update example

* add ckProfiler

* clean

* clean

* clean

* clean

* add client app example

* update readme

* delete obsolete files

* remove old client app

* delete old file

* cleaning

* clean

* remove half

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path for all examples

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path

* fix header path

* revert client app example

* clean build

* fix build

* temporarily disable client test on Jenkins

* clean

* clean

* clean
parent a49115b9
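The gist of the include-path change, using two headers from the softmax example below as an illustration: every header is now spelled out relative to the project's include roots instead of being found through a long list of include directories.

// before: resolved through many per-subdirectory include paths
#include "check_err.hpp"
#include "device_softmax.hpp"

// after: full path from ${PROJECT_SOURCE_DIR}/include or ${PROJECT_SOURCE_DIR}/library/include
#include "ck/library/utility/check_err.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"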
@@ -4,20 +4,15 @@
 #include <cstdlib>
 #include <getopt.h>

-#include "check_err.hpp"
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
-#include "device_base.hpp"
-#include "device_softmax.hpp"
-#include "host_common_util.hpp"
-#include "reference_softmax.hpp"
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_common_util.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"

 using namespace ck;
 using namespace ck::tensor_operation::device;
...
 include_directories(BEFORE
-    ${PROJECT_SOURCE_DIR}/include/ck
-    ${PROJECT_SOURCE_DIR}/include/ck/utility
-    ${PROJECT_SOURCE_DIR}/include/ck/host_utility
-    ${PROJECT_SOURCE_DIR}/include/ck/tensor_description
-    ${PROJECT_SOURCE_DIR}/include/ck/tensor
-    ${PROJECT_SOURCE_DIR}/include/ck/problem_transform
-    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device
-    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/grid
-    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/block
-    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/warp
-    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
-    ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
-    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
-    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
-    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
-    ${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
-    ${PROJECT_SOURCE_DIR}/external/include/half
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_SOURCE_DIR}/library/include
 )
 add_custom_target(examples)
...
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#ifndef CK_CONFIG_AMD_HPP
-#define CK_CONFIG_AMD_HPP
+#pragma once

 #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
 #endif

+#define CK_TIME_KERNEL 1
+
 // constant address space for kernel parameter
 // https://llvm.org/docs/AMDGPUUsage.html#address-spaces
 #define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
@@ -152,6 +153,7 @@ enum struct InMemoryDataOperationEnum
     Add
 };

+// FIXME: use regular Sequence and remove this
 template <InMemoryDataOperationEnum... Is>
 struct InMemoryDataOperationEnumSequence
 {
@@ -165,6 +167,7 @@ struct InMemoryDataOperationEnumSequence
     }
 };

+#if 0
 // TODO: no longer needed, remove this
 enum struct ActivTypeEnum
 {
@@ -172,10 +175,10 @@ enum struct ActivTypeEnum
     LeakyRelu,
     Sigmoid
 };
+#endif

 // index type
 using index_t      = int32_t;
 using long_index_t = int64_t;

 } // namespace ck
-#endif
@@ -2,6 +2,7 @@
 #include <string>
 #include <map>
+#include <hip/hip_runtime.h>

 namespace ck {
...
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+inline void hip_check_error(hipError_t x)
+{
+    if(x != hipSuccess)
+    {
+        std::ostringstream ss;
+        ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
+           << "in function: " << __func__;
+        throw std::runtime_error(ss.str());
+    }
+}
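hip_check_error now lives in its own header. A minimal usage sketch, assuming the repository's two include roots are on the compiler's include path; the buffer size is made up for illustration:

#include <hip/hip_runtime.h>

#include "ck/device_utility/hip_check_error.hpp"

int main()
{
    void* p = nullptr;

    // every HIP runtime call returns a hipError_t; hip_check_error throws
    // std::runtime_error with the failing file and line if the call did not succeed
    hip_check_error(hipMalloc(&p, 1024));
    hip_check_error(hipMemset(p, 0, 1024));
    hip_check_error(hipFree(p));

    return 0;
}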
 #pragma once

-#include <memory>
-#include <functional>
-#include <thread>
-#include <chrono>
 #include <hip/hip_runtime.h>
-#include <hip/hip_fp16.h>

-#include "stream_config.hpp"
-#include "ck/options.hpp"
+#include "ck/ck.hpp"
+#include "ck/stream_config.hpp"
+#include "ck/device_utility/hip_check_error.hpp"

-template <typename T>
-__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
-{
-    for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x)
-    {
-        p[i] = x;
-    }
-}
-
-inline void hip_check_error(hipError_t x)
-{
-    if(x != hipSuccess)
-    {
-        std::ostringstream ss;
-        ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
-           << "in function: " << __func__;
-        throw std::runtime_error(ss.str());
-    }
-}
-
-struct DeviceMem
-{
-    DeviceMem() = delete;
-    DeviceMem(std::size_t mem_size);
-    void* GetDeviceBuffer();
-    std::size_t GetBufferSize();
-    void ToDevice(const void* p);
-    void FromDevice(void* p);
-    void SetZero();
-    template <typename T>
-    void SetValue(T x)
-    {
-        if(mMemSize % sizeof(T) != 0)
-        {
-            throw std::runtime_error("wrong! not entire DeviceMem will be set");
-        }
-
-        set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
-    }
-    ~DeviceMem();
-
-    void* mpDeviceBuf;
-    std::size_t mMemSize;
-};
-
-struct KernelTimerImpl;
-
-struct KernelTimer
-{
-    KernelTimer();
-    ~KernelTimer();
-    void Start();
-    void End();
-    float GetElapsedTime() const;
-
-    std::unique_ptr<KernelTimerImpl> impl;
-};

 template <typename... Args, typename F>
 float launch_and_time_kernel(const StreamConfig& stream_config,
@@ -97,17 +35,27 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
         printf("Start running %d times...\n", nrepeat);

-        KernelTimer timer;
-        timer.Start();
+        hipEvent_t start, stop;
+
+        hip_check_error(hipEventCreate(&start));
+        hip_check_error(hipEventCreate(&stop));
+
+        hip_check_error(hipDeviceSynchronize());
+        hip_check_error(hipEventRecord(start, stream_config.stream_id_));

         for(int i = 0; i < nrepeat; ++i)
         {
             kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
         }

-        timer.End();
+        hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
+        hip_check_error(hipEventSynchronize(stop));
+
+        float total_time = 0;
+
+        hip_check_error(hipEventElapsedTime(&total_time, start, stop));

-        return timer.GetElapsedTime() / nrepeat;
+        return total_time / nrepeat;
     }
     else
     {
...
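launch_and_time_kernel now times the launch loop directly with hipEvent_t instead of the removed KernelTimer. Below is a self-contained sketch of the same pattern; the kernel, problem size, repeat count, and the HIP_CHECK macro (standing in for hip_check_error) are invented for illustration:

#include <cstdio>
#include <hip/hip_runtime.h>

// local stand-in for ck's hip_check_error, so the sketch compiles on its own
#define HIP_CHECK(cmd)                                               \
    do                                                               \
    {                                                                \
        hipError_t err_ = (cmd);                                     \
        if(err_ != hipSuccess)                                       \
        {                                                            \
            std::printf("HIP error: %s\n", hipGetErrorString(err_)); \
            return 1;                                                \
        }                                                            \
    } while(0)

// dummy kernel, only here to have something to time
__global__ void scale_kernel(float* p, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n)
    {
        p[i] = 2.0f * p[i] + 1.0f;
    }
}

int main()
{
    const int n       = 1 << 20; // illustrative problem size
    const int nrepeat = 10;      // illustrative repeat count

    void* buf = nullptr;
    HIP_CHECK(hipMalloc(&buf, n * sizeof(float)));
    HIP_CHECK(hipMemset(buf, 0, n * sizeof(float)));

    hipEvent_t start, stop;
    HIP_CHECK(hipEventCreate(&start));
    HIP_CHECK(hipEventCreate(&stop));

    // drain outstanding work, then bracket the launch loop with events
    HIP_CHECK(hipDeviceSynchronize());
    HIP_CHECK(hipEventRecord(start, nullptr));

    for(int i = 0; i < nrepeat; ++i)
    {
        scale_kernel<<<(n + 255) / 256, 256, 0, nullptr>>>(static_cast<float*>(buf), n);
    }

    HIP_CHECK(hipEventRecord(stop, nullptr));
    HIP_CHECK(hipEventSynchronize(stop));

    float total_ms = 0;
    HIP_CHECK(hipEventElapsedTime(&total_ms, start, stop)); // milliseconds

    std::printf("average kernel time: %f ms\n", total_ms / nrepeat);

    HIP_CHECK(hipEventDestroy(start));
    HIP_CHECK(hipEventDestroy(stop));
    HIP_CHECK(hipFree(buf));

    return 0;
}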
-#pragma once
-
-#define CK_TIME_KERNEL 1
-#ifndef CK_CLUSTER_DESCRIPTOR_HPP
-#define CK_CLUSTER_DESCRIPTOR_HPP
+#pragma once

-#include "common_header.hpp"
-#include "tensor_adaptor.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"

 namespace ck {
@@ -30,4 +29,3 @@ __host__ __device__ constexpr auto make_cluster_descriptor(
 }

 } // namespace ck
-#endif
-#ifndef CK_MULTI_INDEX_TRANSFORM_HPP
-#define CK_MULTI_INDEX_TRANSFORM_HPP
+#pragma once

-#include "common_header.hpp"
-#include "multi_index.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/multi_index.hpp"

 namespace ck {
@@ -1950,4 +1949,3 @@ struct Modulo
 }
 };
 } // namespace ck
-#endif
-#ifndef CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
-#define CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
+#pragma once

-#include "common_header.hpp"
-#include "multi_index_transform.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/multi_index_transform.hpp"

 namespace ck {
@@ -126,4 +125,3 @@ __host__ __device__ constexpr auto make_modulo_transform(const Modulus& modulus,
     return Modulo<Modulus, UpLength>{modulus, up_length};
 }
 } // namespace ck
-#endif
-#ifndef CK_TENSOR_ADAPTOR_HPP
-#define CK_TENSOR_ADAPTOR_HPP
+#pragma once

-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"

 namespace ck {
@@ -478,4 +477,3 @@ __host__ __device__ constexpr auto chain_tensor_adaptors(const X& x, const Xs&..
 }

 } // namespace ck
-#endif
-#ifndef CK_TENSOR_DESCRIPTOR_HPP
-#define CK_TENSOR_DESCRIPTOR_HPP
+#pragma once

-#include "common_header.hpp"
-#include "multi_index_transform.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/multi_index_transform.hpp"

 namespace ck {
@@ -604,4 +603,3 @@ using TensorCoordinateStep_t = decltype(make_tensor_coordinate_step(
     TensorDesc{}, MultiIndex<remove_cvref_t<TensorDesc>::GetNumOfDimension()>{}));

 } // namespace ck
-#endif
 #pragma once

-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "multi_index_transform_helper.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"

 namespace ck {
...
-#ifndef TENSOR_SPACE_FILLING_CURVE_HPP
-#define TENSOR_SPACE_FILLING_CURVE_HPP
+#pragma once

-#include "math.hpp"
-#include "sequence.hpp"
-#include "sequence_helper.hpp"
-#include "tensor_adaptor.hpp"
-#include "statically_indexed_array_multi_index.hpp"
-#include "tuple_helper.hpp"
+#include "ck/utility/math.hpp"
+#include "ck/utility/sequence.hpp"
+#include "ck/utility/sequence_helper.hpp"
+#include "ck/utility/statically_indexed_array_multi_index.hpp"
+#include "ck/utility/tuple_helper.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"

 namespace ck {
@@ -156,4 +155,3 @@ struct SpaceFillingCurve
 };

 } // namespace ck
-#endif
 #pragma once

-#include "common_header.hpp"
-#include "tensor_adaptor.hpp"
-#include "threadwise_tensor_slice_transfer_v4r1.hpp"
-#include "threadwise_contraction_dl.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp"

 namespace ck {
...
 #pragma once

-#include "common_header.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-#include "xdlops_gemm.hpp"
-#include "tensor_adaptor.hpp"
-#include "thread_group.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/warp/xdlops_gemm.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"

 namespace ck {
...
-#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
-#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
+#pragma once

-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "cluster_descriptor.hpp"
-#include "threadwise_tensor_slice_transfer_v5r1.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp"

 namespace ck {
@@ -152,4 +151,3 @@ struct BlockwiseTensorSliceTransfer_v5r1
 };

 } // namespace ck
-#endif
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2020 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
-#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
+#pragma once

-#include "reduction_common.hpp"
-#include "reduction_functions_accumulate.hpp"
-#include "cluster_descriptor.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/utility/reduction_common.hpp"
+#include "ck/utility/reduction_functions_accumulate.hpp"

 namespace ck {
@@ -193,6 +166,4 @@ struct PartitionedBlockwiseReductionWithIndex
 };
 };

-}; // end of namespace ck
-#endif
+} // namespace ck
 #pragma once

-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "cluster_descriptor.hpp"
-#include "threadwise_tensor_slice_transfer_v3r1.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp"

 namespace ck {
...