yangql / composable_kernel-1 / Commits

Commit 4957d5a3, authored May 02, 2019 by Chao Liu
Commit message: refactored
Parent commit: 4a99f54c

Showing 7 changed files with 52 additions and 76 deletions (+52 -76)
Files changed:

src/include/ConstantMatrixDescriptor.hip.hpp    +0  -4
src/include/blockwise_2d_tensor_op.hip.hpp      +1  -1
src/include/blockwise_batched_gemm.hip.hpp      +1  -1
src/include/blockwise_gemm.hip.hpp              +1  -1
src/include/common.hip.hpp                      +2  -2
src/include/config.h.in                         +45 -0
src/include/vector_type.hip.hpp                 +2  -67
src/include/ConstantMatrixDescriptor.hip.hpp

@@ -23,11 +23,7 @@ struct ConstantMatrixDescriptor
     __host__ __device__ index_t Get1dIndex(index_t irow, index_t icol) const
     {
-#if DEVICE_BACKEND_HIP
-        return __mul24(irow, RowStride_) + icol;
-#else
         return irow * RowStride_ + icol;
-#endif
     }

     template <index_t SubNRow, index_t SubNCol>
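The deleted branch used the HIP __mul24 intrinsic for the row-major offset; after this change both backends fall through to the plain expression, which also drops the implicit assumption that irow and RowStride_ fit in 24 bits. A minimal host-side sketch of what Get1dIndex computes, using hypothetical free-function names (get_1d_index, row_stride) rather than the class member itself:

    // Hypothetical stand-alone sketch: offset of element (irow, icol) in a
    // row-major matrix whose consecutive rows are row_stride elements apart.
    #include <cassert>
    #include <cstdint>

    using index_t = uint32_t;

    index_t get_1d_index(index_t irow, index_t icol, index_t row_stride)
    {
        return irow * row_stride + icol;
    }

    int main()
    {
        // Element (2, 3) with a row stride of 8 lives at offset 2 * 8 + 3 = 19.
        assert(get_1d_index(2, 3, 8) == 19);
        return 0;
    }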
src/include/blockwise_2d_tensor_op.hip.hpp

@@ -668,7 +668,7 @@ struct Blockwise2dTensorCopy3
         }
     }

-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
     __device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
                                                  Float* p_clipboard) const
     {
src/include/blockwise_batched_gemm.hip.hpp

@@ -283,7 +283,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
         }
     }

-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
    template <class FloatA, class FloatB, class FloatC>
    __device__ void Run_asm(const FloatA* __restrict__ p_a_block,
                            const FloatB* __restrict__ p_b_block,
src/include/blockwise_gemm.hip.hpp

@@ -126,7 +126,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                          n_repeat * NPerLevel1Cluster + n_in_sub_c};
     }

-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
     // TODO: this is not working correctly
     template <class FloatA, class FloatB, class FloatC>
     __device__ void Run_asm(const FloatA* __restrict__ p_a_block,
src/include/common.hip.hpp

 #pragma once

-#include "data_type.hip.hpp"
+#include "vector_type.hip.hpp"
 #include "constant_integral.hip.hpp"
 #include "Sequence.hip.hpp"
 #include "Array.hip.hpp"
 #include "functional.hip.hpp"
 #include "functional2.hip.hpp"

-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hip.hpp"
 #endif
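The four #if switches above (blockwise_2d_tensor_op, blockwise_batched_gemm, blockwise_gemm, common) all make the same change: the inline-assembly paths are no longer compiled whenever the HIP backend is selected, but only when USE_AMD_INLINE_ASM is set, which config.h.in below now defines per backend. A minimal sketch of the resulting pattern, with a hypothetical ExampleBlockwiseOp standing in for the real structs:

    // Hypothetical illustration of the new gating; the real structs and the
    // amd_inline_asm.hip.hpp helpers are not reproduced here.
    #define DEVICE_BACKEND_HIP 1
    #define USE_AMD_INLINE_ASM 0 // e.g. a HIP build with the asm path disabled

    struct ExampleBlockwiseOp
    {
        void Run() const {} // generic path, always compiled

    #if USE_AMD_INLINE_ASM
        void Run_asm() const {} // asm path, now opt-in via the new macro
    #endif
    };

    int main()
    {
        ExampleBlockwiseOp op;
        op.Run();
        return 0;
    }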
src/include/config.h.in

@@ -5,11 +5,56 @@
 #if DEVICE_BACKEND_HIP
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
+
+#define USE_AMD_INLINE_ASM 1
+// For some reason, HIP compiler need this definition to generate optimal load and store
+// instruction
+typedef float float2_t __attribute__((ext_vector_type(2)));
+typedef float float4_t __attribute__((ext_vector_type(4)));
 #elif DEVICE_BACKEND_CUDA
 #include "cuda_runtime.h"
 #include "cuda_fp16.h"
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
+
+#define USE_AMD_INLINE_ASM 0
+// For some reason, CUDA need this definition, otherwise
+// compiler won't generate optimal load and store instruction, and
+// kernel would produce wrong result, indicating the compiler fail to generate correct
+// instruction,
+using float2_t = float2;
+using float4_t = float4;
 #endif

 using index_t = uint32_t;
+
+__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
+{
+    d += s0 * s1;
+}
+
+#if 0
+__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
+
+__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x;
+    d += s0.y * s1.y;
+}
+
+__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x + s0.y * s1.y;
+}
+
+__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
+
+// TODO:: this interface is misleading, s0, s1 are actually int8x4
+// need to make a better interface
+__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
+{
+#if DEVICE_BACKEND_CUDA
+    d = __dp4a(s0, s1, d);
+#endif
+}
+#endif
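A short sketch of how the new float4_t typedef and the fused_multiply_accumulate overload added above are meant to combine. This is a host-side example with a hypothetical helper (dot4), __device__ dropped, and it assumes a Clang-based compiler such as hipcc for the ext_vector_type attribute:

    // float4_t as defined above for the HIP backend.
    typedef float float4_t __attribute__((ext_vector_type(4)));

    void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
    {
        d += s0 * s1;
    }

    // Hypothetical helper: accumulate a 4-wide dot product lane by lane.
    float dot4(const float4_t& a, const float4_t& b)
    {
        float acc = 0.0f;
        for(int i = 0; i < 4; ++i)
        {
            fused_multiply_accumulate(acc, a[i], b[i]);
        }
        return acc;
    }

    int main()
    {
        float4_t a = {1.0f, 2.0f, 3.0f, 4.0f};
        float4_t b = {1.0f, 1.0f, 1.0f, 1.0f};
        return dot4(a, b) == 10.0f ? 0 : 1;
    }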
src/include/data_type.hip.hpp → src/include/vector_type.hip.hpp
@@ -23,17 +23,7 @@ struct vector_type<float, 1>
 template <>
 struct vector_type<float, 2>
 {
-#if DEVICE_BACKEND_HIP
-    // For some reason, HIP compiler need this definition to generate optimal load and store
-    // instruction
-    typedef float MemoryType __attribute__((ext_vector_type(2)));
-#elif DEVICE_BACKEND_CUDA
-    // For some reason, CUDA need this definition, otherwise
-    // compiler won't generate optimal load and store instruction, and
-    // kernel would produce wrong result, indicating the compiler fail to generate correct
-    // instruction,
-    using MemoryType = float2;
-#endif
+    using MemoryType = float2_t;

     union Data
     {

@@ -60,17 +50,7 @@ struct vector_type<float, 2>
 template <>
 struct vector_type<float, 4>
 {
-#if DEVICE_BACKEND_HIP
-    // For some reason, HIP compiler need this definition to generate optimal load and store
-    // instruction
-    typedef float MemoryType __attribute__((ext_vector_type(4)));
-#elif DEVICE_BACKEND_CUDA
-    // For some reason, CUDA need this definition, otherwise
-    // compiler won't generate optimal load and store instruction, and
-    // kernel would produce wrong result, indicating the compiler fail to generate correct
-    // instruction,
-    using MemoryType = float4;
-#endif
+    using MemoryType = float4_t;

     template <index_t I>
     __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)

@@ -204,48 +184,3 @@ struct vector_type<char4, 2>
     using MemoryType = int64_t;
 };
 #endif
-
-__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
-{
-    d += s0 * s1;
-}
-
-__device__ void fused_multiply_accumulate(float& d, const float2& s0, const float2& s1)
-{
-    d += s0.x * s1.x;
-    d += s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1)
-{
-    d += s0.x * s1.x;
-    d += s0.y * s1.y;
-    d += s0.z * s1.z;
-    d += s0.w * s1.w;
-}
-
-#if 0
-__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
-
-__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
-{
-    d += s0.x * s1.x;
-    d += s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
-{
-    d += s0.x * s1.x + s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
-
-// TODO:: this interface is misleading, s0, s1 are actually int8x4
-// need to make a better interface
-__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
-{
-#if DEVICE_BACKEND_CUDA
-    d = __dp4a(s0, s1, d);
-#endif
-}
-#endif
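With the backend #if removed, both specializations now pick up float2_t and float4_t from config.h.in. A minimal sketch of why MemoryType exists at all: callers reinterpret a scalar buffer through vector_type<...>::MemoryType so that one load or store moves the whole vector. The local vector_type below is a hypothetical stand-in for the real template, and the example again assumes a Clang-based compiler for ext_vector_type:

    #include <cstring>

    typedef float float4_t __attribute__((ext_vector_type(4)));

    // Hypothetical stand-in for the primary template in vector_type.hip.hpp.
    template <class T, int N>
    struct vector_type;

    template <>
    struct vector_type<float, 4>
    {
        using MemoryType = float4_t; // the one line this diff keeps
    };

    // Copy four floats through a single 16-byte vector value.
    void copy4(const float* src, float* dst)
    {
        using vec_t = vector_type<float, 4>::MemoryType;
        vec_t tmp;
        std::memcpy(&tmp, src, sizeof(vec_t)); // vector-wide load
        std::memcpy(dst, &tmp, sizeof(vec_t)); // vector-wide store
    }

    int main()
    {
        float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float out[4] = {};
        copy4(in, out);
        return out[3] == 4.0f ? 0 : 1;
    }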