Commit 21f7e9f1 authored by Chao Liu's avatar Chao Liu
Browse files

refactor

parent 9de63930
...@@ -33,13 +33,9 @@ endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") ) ...@@ -33,13 +33,9 @@ endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
#GPU backend #GPU backend
if(DEVICE_BACKEND STREQUAL "AMD") if(DEVICE_BACKEND STREQUAL "AMD")
set(CK_DEVICE_BACKEND_AMD 1)
set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH})
find_package(HIP REQUIRED) find_package(HIP REQUIRED)
elseif(DEVICE_BACKEND STREQUAL "NVIDIA") elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set(CK_DEVICE_BACKEND_NVIDIA 1)
enable_language(CUDA) enable_language(CUDA)
include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR}) include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
endif() endif()
......
...@@ -54,9 +54,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 ...@@ -54,9 +54,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0, N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
"wrong! Cannot evenly divide work among\n"); "wrong! Cannot evenly divide work among\n");
static_assert(std::is_same<decltype(ThreadMatrixC::GetLengths()), static_assert(
decltype(GetThreadMatrixCLengths())>{}, is_same<decltype(ThreadMatrixC::GetLengths()), decltype(GetThreadMatrixCLengths())>{},
"wrong! ThreadMatrixC lengths is wrong"); "wrong! ThreadMatrixC lengths is wrong");
auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
namespace ck { namespace ck {
// cast a pointer of LDS to its address // cast a pointer of LDS to its address
extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);
__device__ void vmcnt(index_t cnt) __device__ void vmcnt(index_t cnt)
{ {
......
#ifndef CK_COMMON_HPP #ifndef CK_COMMON_HEADER_HPP
#define CK_COMMON_HPP #define CK_COMMON_HEADER_HPP
#include "config.hpp" #include "config.hpp"
#include "integral_constant.hpp"
#include "math.hpp"
#include "utility.hpp" #include "utility.hpp"
#include "vector_type.hpp" #include "vector_type.hpp"
#include "integral_constant.hpp"
#include "Sequence.hpp" #include "Sequence.hpp"
#include "Array.hpp" #include "Array.hpp"
#include "functional.hpp" #include "functional.hpp"
#include "functional2.hpp" #include "functional2.hpp"
#include "functional3.hpp" #include "functional3.hpp"
#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
#endif
#endif #endif
#ifndef CK_CONFIG_AMD_HPP #ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP #define CK_CONFIG_AMD_HPP
#cmakedefine01 CK_DEVICE_BACKEND_AMD
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
#include "hip/hip_fp16.h" #include "hip/hip_fp16.h"
#define CK_USE_AMD_INLINE_ASM 1
#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INLINE_ASM 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
......
#ifndef CK_CONFIG_NVIDIA_HPP #ifndef CK_CONFIG_NVIDIA_HPP
#define CK_CONFIG_NVIDIA_HPP #define CK_CONFIG_NVIDIA_HPP
#cmakedefine01 CK_DEVICE_BACKEND_NVIDIA
#include "cuda_runtime.h" #include "cuda_runtime.h"
#include "cuda_fp16.h" #include "cuda_fp16.h"
#include "nvToolsExt.h" #include "nvToolsExt.h"
#include "helper_cuda.h" #include "helper_cuda.h"
#define CK_USE_AMD_INLINE_ASM 0
#define CK_DEVICE_BACKEND_NVIDIA 1
#define CK_USE_AMD_INLINE_ASM 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
...@@ -23,10 +22,6 @@ using float4_t = float4; ...@@ -23,10 +22,6 @@ using float4_t = float4;
using index_t = uint32_t; using index_t = uint32_t;
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; }
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1) __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
{ {
d += s0 * s1; d += s0 * s1;
...@@ -52,9 +47,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s ...@@ -52,9 +47,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s
// need to make a better interface // need to make a better interface
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1) __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
{ {
#if CK_DEVICE_BACKEND_NVIDIA
d = __dp4a(s0, s1, d); d = __dp4a(s0, s1, d);
#endif
} }
#endif #endif
......
#ifndef CK_INTEGRAL_CONSTANT_HPP #ifndef CK_INTEGRAL_CONSTANT_HPP
#define CK_INTEGRAL_CONSTANT_HPP #define CK_INTEGRAL_CONSTANT_HPP
#include <type_traits>
namespace ck { namespace ck {
template <class T, T v> template <class T, T v>
using integral_constant = std::integral_constant<T, v>; struct integral_constant
{
static constexpr T value = v;
typedef T value_type;
typedef integral_constant type; // using injected-class-name
__host__ __device__ constexpr operator value_type() const noexcept { return value; }
__host__ __device__ constexpr value_type operator()() const noexcept
{
return value;
} // since c++14
};
template <class T, T X, T Y> template <class T, T X, T Y>
__host__ __device__ constexpr auto operator+(integral_constant<T, X>, integral_constant<T, Y>) __host__ __device__ constexpr auto operator+(integral_constant<T, X>, integral_constant<T, Y>)
...@@ -23,5 +31,15 @@ __host__ __device__ constexpr auto operator*(integral_constant<T, X>, integral_c ...@@ -23,5 +31,15 @@ __host__ __device__ constexpr auto operator*(integral_constant<T, X>, integral_c
template <index_t N> template <index_t N>
using Number = integral_constant<index_t, N>; using Number = integral_constant<index_t, N>;
// Compile-time type-equality trait, a device-friendly stand-in for
// std::is_same: the primary template yields integral_constant<bool, false>.
template <class X, class Y>
struct is_same : public integral_constant<bool, false>
{
};

// Partial specialization chosen when both type arguments are identical;
// yields integral_constant<bool, true>.
template <class X>
struct is_same<X, X> : public integral_constant<bool, true>
{
};
} // namespace ck } // namespace ck
#endif #endif
#ifndef CK_MATH_HPP
#define CK_MATH_HPP
#include "config.hpp"
namespace ck {
namespace math {
// Unary functor that scales its argument by the compile-time constant s.
template <class T, T s>
struct scales
{
    __host__ __device__ constexpr T operator()(T a) const
    {
        // s is a non-type template parameter, so the multiplier folds at compile time.
        return s * a;
    }
};
// Binary addition functor, usable from both host and device code.
template <class T>
struct plus
{
    __host__ __device__ constexpr T operator()(T lhs, T rhs) const
    {
        return lhs + rhs;
    }
};
// Binary subtraction functor, usable from both host and device code.
template <class T>
struct minus
{
    __host__ __device__ constexpr T operator()(T lhs, T rhs) const
    {
        return lhs - rhs;
    }
};
// Binary multiplication functor, usable from both host and device code.
template <class T>
struct multiplies
{
    __host__ __device__ constexpr T operator()(T lhs, T rhs) const
    {
        return lhs * rhs;
    }
};
// Functor form of integer_divide_ceil: rounds the integer quotient a/b up.
// Restricted to the index types this library uses (index_t or int).
template <class T>
struct integer_divide_ceiler
{
    __host__ __device__ constexpr T operator()(T a, T b) const
    {
        static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
        // Classic ceil-division trick for non-negative integers.
        const T biased = a + b - 1;
        return biased / b;
    }
};
// Rounds the integer quotient a/b toward positive infinity.
// Restricted to the index types this library uses (index_t or int).
template <class T>
__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
{
    static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
    // Classic ceil-division trick for non-negative integers.
    const T biased = a + b - 1;
    return biased / b;
}
// Smallest multiple of b that is greater than or equal to a.
template <class T>
__host__ __device__ constexpr T integer_least_multiple(T a, T b)
{
    static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
    const T num_blocks = integer_divide_ceil(a, b);
    return num_blocks * b;
}
// Variadic max, base case: a single value is its own maximum.
template <class T>
__host__ __device__ constexpr T max(T x)
{
    return x;
}

// Variadic max over two or more arguments; every argument must have the
// same type T (enforced below).
template <class T, class... Ts>
__host__ __device__ constexpr T max(T x, Ts... xs)
{
    static_assert(sizeof...(xs) > 0, "not enough argument");

    auto rest = max(xs...);

    static_assert(is_same<decltype(rest), T>{}, "not the same type");

    return (rest > x) ? rest : x;
}
// Variadic min, base case: a single value is its own minimum.
template <class T>
__host__ __device__ constexpr T min(T x)
{
    return x;
}

// Variadic min over two or more arguments; every argument must have the
// same type T (enforced below).
template <class T, class... Ts>
__host__ __device__ constexpr T min(T x, Ts... xs)
{
    static_assert(sizeof...(xs) > 0, "not enough argument");

    auto rest = min(xs...);

    static_assert(is_same<decltype(rest), T>{}, "not the same type");

    return (rest < x) ? rest : x;
}
// Greatest common divisor (Euclid's algorithm); helper for lcm below.
template <class T>
__host__ __device__ constexpr T lcm_gcd(T a, T b)
{
    return b == 0 ? a : lcm_gcd(b, a % b);
}

// Least common multiple, base case: lcm of one value is the value itself.
template <class T>
__host__ __device__ constexpr T lcm(T x)
{
    return x;
}

// Least common multiple of two or more integers.
// Fixes the previous placeholder, which incorrectly returned max(x, xs...)
// (it was flagged "this is WRONG" with a TODO to implement lcm properly).
// Divides by the gcd before multiplying to reduce overflow risk; by
// convention lcm(x, 0) == 0.
template <class T, class... Ts>
__host__ __device__ constexpr T lcm(T x, Ts... xs)
{
    const T y = lcm(xs...);
    return (x == 0 || y == 0) ? T(0) : (x / lcm_gcd(x, y)) * y;
}
} // namespace math
} // namespace ck
#endif
#ifndef CK_UTILITY_HPP #ifndef CK_UTILITY_HPP
#define CK_UTILITY_HPP #define CK_UTILITY_HPP
#include <type_traits>
#include "config.hpp" #include "config.hpp"
namespace ck { namespace ck {
template <class X, class Y> __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
using is_same = std::is_same<X, Y>;
namespace math { __device__ index_t get_block_1d_id() { return blockIdx.x; }
template <class T, T s>
struct scales
{
__host__ __device__ constexpr T operator()(T a) const { return s * a; }
};
template <class T>
struct plus
{
__host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
};
template <class T>
struct minus
{
__host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
};
template <class T>
struct multiplies
{
__host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
};
template <class T>
struct integer_divide_ceiler
{
__host__ __device__ constexpr T operator()(T a, T b) const
{
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
return (a + b - 1) / b;
}
};
template <class T>
__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
{
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
return (a + b - 1) / b;
}
template <class T>
__host__ __device__ constexpr T integer_least_multiple(T a, T b)
{
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
return b * integer_divide_ceil(a, b);
}
template <class T>
__host__ __device__ constexpr T max(T x)
{
return x;
}
template <class T, class... Ts>
__host__ __device__ constexpr T max(T x, Ts... xs)
{
static_assert(sizeof...(xs) > 0, "not enough argument");
auto y = max(xs...);
static_assert(is_same<decltype(y), T>{}, "not the same type");
return x > y ? x : y;
}
template <class T>
__host__ __device__ constexpr T min(T x)
{
return x;
}
template <class T, class... Ts>
__host__ __device__ constexpr T min(T x, Ts... xs)
{
static_assert(sizeof...(xs) > 0, "not enough argument");
auto y = min(xs...);
static_assert(is_same<decltype(y), T>{}, "not the same type");
return x < y ? x : y;
}
// this is WRONG
// TODO: implement least common multiple properly, instead of calling max()
template <class T, class... Ts>
__host__ __device__ constexpr T lcm(T x, Ts... xs)
{
return max(x, xs...);
}
} // namespace math
} // namespace ck } // namespace ck
#endif #endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment