Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
21f7e9f1
"docs/git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "e156331b0e57c92640cc6ac6725d4d9b5e1f567d"
Commit
21f7e9f1
authored
Jun 19, 2019
by
Chao Liu
Browse files
refactor
parent
9de63930
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
142 additions
and
130 deletions
+142
-130
CMakeLists.txt
CMakeLists.txt
+0
-4
composable_kernel/include/tensor_operation/blockwise_gemm.hpp
...osable_kernel/include/tensor_operation/blockwise_gemm.hpp
+3
-3
composable_kernel/include/utility/amd_inline_asm.hpp
composable_kernel/include/utility/amd_inline_asm.hpp
+1
-1
composable_kernel/include/utility/common_header.hpp
composable_kernel/include/utility/common_header.hpp
+4
-7
composable_kernel/include/utility/config_amd.hpp.in
composable_kernel/include/utility/config_amd.hpp.in
+2
-3
composable_kernel/include/utility/config_nvidia.hpp.in
composable_kernel/include/utility/config_nvidia.hpp.in
+2
-9
composable_kernel/include/utility/integral_constant.hpp
composable_kernel/include/utility/integral_constant.hpp
+21
-3
composable_kernel/include/utility/math.hpp
composable_kernel/include/utility/math.hpp
+107
-0
composable_kernel/include/utility/utility.hpp
composable_kernel/include/utility/utility.hpp
+2
-100
No files found.
CMakeLists.txt
View file @
21f7e9f1
...
@@ -33,13 +33,9 @@ endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
...
@@ -33,13 +33,9 @@ endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
#GPU backend
#GPU backend
if
(
DEVICE_BACKEND STREQUAL
"AMD"
)
if
(
DEVICE_BACKEND STREQUAL
"AMD"
)
set
(
CK_DEVICE_BACKEND_AMD 1
)
set
(
CMAKE_MODULE_PATH
"/opt/rocm/hip/cmake"
${
CMAKE_MODULE_PATH
}
)
set
(
CMAKE_MODULE_PATH
"/opt/rocm/hip/cmake"
${
CMAKE_MODULE_PATH
}
)
find_package
(
HIP REQUIRED
)
find_package
(
HIP REQUIRED
)
elseif
(
DEVICE_BACKEND STREQUAL
"NVIDIA"
)
elseif
(
DEVICE_BACKEND STREQUAL
"NVIDIA"
)
set
(
CK_DEVICE_BACKEND_NVIDIA 1
)
enable_language
(
CUDA
)
enable_language
(
CUDA
)
include_directories
(
BEFORE
${
CUDA_COMMON_INCLUDE_DIR
}
)
include_directories
(
BEFORE
${
CUDA_COMMON_INCLUDE_DIR
}
)
endif
()
endif
()
...
...
composable_kernel/include/tensor_operation/blockwise_gemm.hpp
View file @
21f7e9f1
...
@@ -54,9 +54,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
...
@@ -54,9 +54,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
N
%
(
NPerThreadSubC
*
NLevel0Cluster
*
NLevel1Cluster
)
==
0
,
N
%
(
NPerThreadSubC
*
NLevel0Cluster
*
NLevel1Cluster
)
==
0
,
"wrong! Cannot evenly divide work among
\n
"
);
"wrong! Cannot evenly divide work among
\n
"
);
static_assert
(
std
::
is_same
<
decltype
(
ThreadMatrixC
::
GetLengths
()),
static_assert
(
decltype
(
GetThreadMatrixCLengths
())
>
{},
is_same
<
decltype
(
ThreadMatrixC
::
GetLengths
()),
decltype
(
GetThreadMatrixCLengths
())
>
{},
"wrong! ThreadMatrixC lengths is wrong"
);
"wrong! ThreadMatrixC lengths is wrong"
);
auto
c_thread_mtx_index
=
GetBeginOfThreadMatrixC
(
get_thread_local_1d_id
());
auto
c_thread_mtx_index
=
GetBeginOfThreadMatrixC
(
get_thread_local_1d_id
());
...
...
composable_kernel/include/utility/amd_inline_asm.hpp
View file @
21f7e9f1
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
namespace
ck
{
namespace
ck
{
// cast a pointer of LDS to its address
// cast a pointer of LDS to its address
extern
"C"
__attribute__
((
address_space
(
3
)))
void
*
__to_local
(
void
*
p
)
[[
hc
]]
;
extern
"C"
__attribute__
((
address_space
(
3
)))
__device__
void
*
__to_local
(
void
*
p
);
__device__
void
vmcnt
(
index_t
cnt
)
__device__
void
vmcnt
(
index_t
cnt
)
{
{
...
...
composable_kernel/include/utility/common_header.hpp
View file @
21f7e9f1
#ifndef CK_COMMON_HPP
#ifndef CK_COMMON_
HEADER_
HPP
#define CK_COMMON_HPP
#define CK_COMMON_
HEADER_
HPP
#include "config.hpp"
#include "config.hpp"
#include "integral_constant.hpp"
#include "math.hpp"
#include "utility.hpp"
#include "utility.hpp"
#include "vector_type.hpp"
#include "vector_type.hpp"
#include "integral_constant.hpp"
#include "Sequence.hpp"
#include "Sequence.hpp"
#include "Array.hpp"
#include "Array.hpp"
#include "functional.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
#include "functional3.hpp"
#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
#endif
#endif
#endif
composable_kernel/include/utility/config_amd.hpp.in
View file @
21f7e9f1
#ifndef CK_CONFIG_AMD_HPP
#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP
#cmakedefine01 CK_DEVICE_BACKEND_AMD
#include "hip/hip_runtime.h"
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#include "hip/hip_fp16.h"
#define CK_USE_AMD_INLINE_ASM 1
#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INLINE_ASM 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
...
...
composable_kernel/include/utility/config_nvidia.hpp.in
View file @
21f7e9f1
#ifndef CK_CONFIG_NVIDIA_HPP
#ifndef CK_CONFIG_NVIDIA_HPP
#define CK_CONFIG_NVIDIA_HPP
#define CK_CONFIG_NVIDIA_HPP
#cmakedefine01 CK_DEVICE_BACKEND_NVIDIA
#include "cuda_runtime.h"
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include "cuda_fp16.h"
#include "nvToolsExt.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#include "helper_cuda.h"
#define CK_USE_AMD_INLINE_ASM 0
#define CK_DEVICE_BACKEND_NVIDIA 1
#define CK_USE_AMD_INLINE_ASM 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
...
@@ -23,10 +22,6 @@ using float4_t = float4;
...
@@ -23,10 +22,6 @@ using float4_t = float4;
using index_t = uint32_t;
using index_t = uint32_t;
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; }
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
{
{
d += s0 * s1;
d += s0 * s1;
...
@@ -52,9 +47,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s
...
@@ -52,9 +47,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s
// need to make a better interface
// need to make a better interface
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
{
{
#if CK_DEVICE_BACKEND_NVIDIA
d = __dp4a(s0, s1, d);
d = __dp4a(s0, s1, d);
#endif
}
}
#endif
#endif
...
...
composable_kernel/include/utility/integral_constant.hpp
View file @
21f7e9f1
#ifndef CK_INTEGRAL_CONSTANT_HPP
#ifndef CK_INTEGRAL_CONSTANT_HPP
#define CK_INTEGRAL_CONSTANT_HPP
#define CK_INTEGRAL_CONSTANT_HPP
#include <type_traits>
namespace
ck
{
namespace
ck
{
template
<
class
T
,
T
v
>
template
<
class
T
,
T
v
>
using
integral_constant
=
std
::
integral_constant
<
T
,
v
>
;
struct
integral_constant
{
static
constexpr
T
value
=
v
;
typedef
T
value_type
;
typedef
integral_constant
type
;
// using injected-class-name
__host__
__device__
constexpr
operator
value_type
()
const
noexcept
{
return
value
;
}
__host__
__device__
constexpr
value_type
operator
()()
const
noexcept
{
return
value
;
}
// since c++14
};
template
<
class
T
,
T
X
,
T
Y
>
template
<
class
T
,
T
X
,
T
Y
>
__host__
__device__
constexpr
auto
operator
+
(
integral_constant
<
T
,
X
>
,
integral_constant
<
T
,
Y
>
)
__host__
__device__
constexpr
auto
operator
+
(
integral_constant
<
T
,
X
>
,
integral_constant
<
T
,
Y
>
)
...
@@ -23,5 +31,15 @@ __host__ __device__ constexpr auto operator*(integral_constant<T, X>, integral_c
...
@@ -23,5 +31,15 @@ __host__ __device__ constexpr auto operator*(integral_constant<T, X>, integral_c
template
<
index_t
N
>
template
<
index_t
N
>
using
Number
=
integral_constant
<
index_t
,
N
>
;
using
Number
=
integral_constant
<
index_t
,
N
>
;
// Compile-time type-equality trait.
// Primary template: selected when the two types differ, so it derives from
// integral_constant<bool, false>.
template <class X, class Y>
struct is_same : integral_constant<bool, false>
{
};

// Partial specialization selected when both arguments are the same type;
// derives from integral_constant<bool, true>.
template <class X>
struct is_same<X, X> : integral_constant<bool, true>
{
};
}
// namespace ck
}
// namespace ck
#endif
#endif
composable_kernel/include/utility/math.hpp
0 → 100644
View file @
21f7e9f1
#ifndef CK_MATH_HPP
#define CK_MATH_HPP
#include "config.hpp"
namespace
ck
{
namespace
math
{
// Unary functor that multiplies its argument by the compile-time factor `s`.
template <class T, T s>
struct scales
{
    // Returns s * value.
    __host__ __device__ constexpr T operator()(T value) const { return s * value; }
};
// Binary functor returning the sum of its two operands.
template <class T>
struct plus
{
    // Returns lhs + rhs.
    __host__ __device__ constexpr T operator()(T lhs, T rhs) const { return lhs + rhs; }
};
// Binary functor returning the difference of its two operands.
template <class T>
struct minus
{
    // Returns lhs - rhs.
    __host__ __device__ constexpr T operator()(T lhs, T rhs) const { return lhs - rhs; }
};
// Binary functor returning the product of its two operands.
template <class T>
struct multiplies
{
    // Returns lhs * rhs.
    __host__ __device__ constexpr T operator()(T lhs, T rhs) const { return lhs * rhs; }
};
// Functor form of integer division rounded up: evaluates (a + b - 1) / b,
// which for positive operands is a / b rounded up to the next integer.
// Only instantiable with T = index_t or T = int (checked at compile time).
template <class T>
struct integer_divide_ceiler
{
    __host__ __device__ constexpr T operator()(T a, T b) const
    {
        static_assert(is_same<T, index_t>::value || is_same<T, int>::value, "wrong type");

        return (a + b - 1) / b;
    }
};
// Integer division rounded up: evaluates (a + b - 1) / b, which for positive
// operands is a / b rounded up to the next integer.
// Only instantiable with T = index_t or T = int (checked at compile time).
template <class T>
__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
{
    static_assert(is_same<T, index_t>::value || is_same<T, int>::value, "wrong type");

    return (a + b - 1) / b;
}
// Returns b * integer_divide_ceil(a, b); for positive operands this is the
// smallest multiple of b that is not less than a.
// Only instantiable with T = index_t or T = int (checked at compile time).
template <class T>
__host__ __device__ constexpr T integer_least_multiple(T a, T b)
{
    static_assert(is_same<T, index_t>::value || is_same<T, int>::value, "wrong type");

    return b * integer_divide_ceil(a, b);
}
// Single-argument max: the maximum of one value is the value itself.
// Serves as the terminating case for the variadic max() overload.
template <class T>
__host__ __device__ constexpr T max(T x)
{
    return x;
}
// Variadic max: returns the largest of x and xs..., compared with operator>.
// Every argument must have type T; a mismatch is rejected at compile time.
template <class T, class... Ts>
__host__ __device__ constexpr T max(T x, Ts... xs)
{
    static_assert(sizeof...(xs) > 0, "not enough argument");

    // Maximum of everything after the first argument, computed recursively.
    auto tail_max = max(xs...);

    static_assert(is_same<decltype(tail_max), T>::value, "not the same type");

    if(x > tail_max)
    {
        return x;
    }

    return tail_max;
}
// Single-argument min: the minimum of one value is the value itself.
// Serves as the terminating case for the variadic min() overload.
template <class T>
__host__ __device__ constexpr T min(T x)
{
    return x;
}
// Variadic min: returns the smallest of x and xs..., compared with operator<.
// Every argument must have type T; a mismatch is rejected at compile time.
template <class T, class... Ts>
__host__ __device__ constexpr T min(T x, Ts... xs)
{
    static_assert(sizeof...(xs) > 0, "not enough argument");

    // Minimum of everything after the first argument, computed recursively.
    auto tail_min = min(xs...);

    static_assert(is_same<decltype(tail_min), T>::value, "not the same type");

    if(x < tail_min)
    {
        return x;
    }

    return tail_min;
}
// Greatest common divisor by Euclid's algorithm; helper for lcm().
// Written as a single return statement so it remains a valid constexpr
// function even under C++11 rules. gcd(a, 0) == a, hence gcd(0, 0) == 0.
template <class T>
__host__ __device__ constexpr T lcm_impl_gcd(T a, T b)
{
    return b == 0 ? a : lcm_impl_gcd(b, a % b);
}

// Least common multiple of exactly two values; helper for lcm().
// Returns 0 if either operand is 0, which also avoids dividing by gcd(0, 0).
// Divides before multiplying to keep the intermediate value as small as possible.
template <class T>
__host__ __device__ constexpr T lcm_impl_pair(T a, T b)
{
    return (a == 0 || b == 0) ? static_cast<T>(0) : (a / lcm_impl_gcd(a, b)) * b;
}

// End of the fold: a single accumulated value is the result.
template <class T>
__host__ __device__ constexpr T lcm_impl_fold(T acc)
{
    return acc;
}

// Folds the pairwise least common multiple over the arguments, left to right.
// All arguments must have the same type T; otherwise template deduction fails.
template <class T, class... Ts>
__host__ __device__ constexpr T lcm_impl_fold(T acc, T next, Ts... rest)
{
    return lcm_impl_fold(lcm_impl_pair(acc, next), rest...);
}

// Least common multiple of one or more values of the same integral type.
//
// Replaces the previous placeholder, which returned max(x, xs...) and was
// marked as wrong. When every argument divides the largest one (for example,
// when all are powers of two) the result is the same as the old max()-based
// value; otherwise it is now the true least common multiple.
//
// Intended for non-negative values; the sign of the result for negative
// operands is not specified here. With a single argument, returns x.
template <class T, class... Ts>
__host__ __device__ constexpr T lcm(T x, Ts... xs)
{
    return lcm_impl_fold(x, xs...);
}
}
// namespace math
}
// namespace ck
#endif
composable_kernel/include/utility/utility.hpp
View file @
21f7e9f1
#ifndef CK_UTILITY_HPP
#ifndef CK_UTILITY_HPP
#define CK_UTILITY_HPP
#define CK_UTILITY_HPP
#include <type_traits>
#include "config.hpp"
#include "config.hpp"
namespace
ck
{
namespace
ck
{
template
<
class
X
,
class
Y
>
__device__
index_t
get_thread_local_1d_id
()
{
return
threadIdx
.
x
;
}
using
is_same
=
std
::
is_same
<
X
,
Y
>
;
namespace
math
{
__device__
index_t
get_block_1d_id
()
{
return
blockIdx
.
x
;
}
template
<
class
T
,
T
s
>
struct
scales
{
__host__
__device__
constexpr
T
operator
()(
T
a
)
const
{
return
s
*
a
;
}
};
template
<
class
T
>
struct
plus
{
__host__
__device__
constexpr
T
operator
()(
T
a
,
T
b
)
const
{
return
a
+
b
;
}
};
template
<
class
T
>
struct
minus
{
__host__
__device__
constexpr
T
operator
()(
T
a
,
T
b
)
const
{
return
a
-
b
;
}
};
template
<
class
T
>
struct
multiplies
{
__host__
__device__
constexpr
T
operator
()(
T
a
,
T
b
)
const
{
return
a
*
b
;
}
};
template
<
class
T
>
struct
integer_divide_ceiler
{
__host__
__device__
constexpr
T
operator
()(
T
a
,
T
b
)
const
{
static_assert
(
is_same
<
T
,
index_t
>
{}
||
is_same
<
T
,
int
>
{},
"wrong type"
);
return
(
a
+
b
-
1
)
/
b
;
}
};
template
<
class
T
>
__host__
__device__
constexpr
T
integer_divide_ceil
(
T
a
,
T
b
)
{
static_assert
(
is_same
<
T
,
index_t
>
{}
||
is_same
<
T
,
int
>
{},
"wrong type"
);
return
(
a
+
b
-
1
)
/
b
;
}
template
<
class
T
>
__host__
__device__
constexpr
T
integer_least_multiple
(
T
a
,
T
b
)
{
static_assert
(
is_same
<
T
,
index_t
>
{}
||
is_same
<
T
,
int
>
{},
"wrong type"
);
return
b
*
integer_divide_ceil
(
a
,
b
);
}
template
<
class
T
>
__host__
__device__
constexpr
T
max
(
T
x
)
{
return
x
;
}
template
<
class
T
,
class
...
Ts
>
__host__
__device__
constexpr
T
max
(
T
x
,
Ts
...
xs
)
{
static_assert
(
sizeof
...(
xs
)
>
0
,
"not enough argument"
);
auto
y
=
max
(
xs
...);
static_assert
(
is_same
<
decltype
(
y
),
T
>
{},
"not the same type"
);
return
x
>
y
?
x
:
y
;
}
template
<
class
T
>
__host__
__device__
constexpr
T
min
(
T
x
)
{
return
x
;
}
template
<
class
T
,
class
...
Ts
>
__host__
__device__
constexpr
T
min
(
T
x
,
Ts
...
xs
)
{
static_assert
(
sizeof
...(
xs
)
>
0
,
"not enough argument"
);
auto
y
=
min
(
xs
...);
static_assert
(
is_same
<
decltype
(
y
),
T
>
{},
"not the same type"
);
return
x
<
y
?
x
:
y
;
}
// this is WRONG
// TODO: implement least common multiple properly, instead of calling max()
template
<
class
T
,
class
...
Ts
>
__host__
__device__
constexpr
T
lcm
(
T
x
,
Ts
...
xs
)
{
return
max
(
x
,
xs
...);
}
}
// namespace math
}
// namespace ck
}
// namespace ck
#endif
#endif
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment