Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
2a87a973
Commit
2a87a973
authored
Feb 20, 2021
by
Chao Liu
Browse files
use raw buffer intrinsic
parent
079b745d
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
72 additions
and
88 deletions
+72
-88
composable_kernel/include/gridwise_operation_wrapper.hpp
composable_kernel/include/gridwise_operation_wrapper.hpp
+2
-2
composable_kernel/include/utility/amd_buffer_addressing.hpp
composable_kernel/include/utility/amd_buffer_addressing.hpp
+1
-11
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
...sable_kernel/include/utility/amd_buffer_addressing_v2.hpp
+67
-66
composable_kernel/include/utility/config.amd.hpp.in
composable_kernel/include/utility/config.amd.hpp.in
+0
-7
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+2
-2
No files found.
composable_kernel/include/gridwise_operation_wrapper.hpp
View file @
2a87a973
...
...
@@ -3,10 +3,10 @@
template
<
typename
GridwiseOp
,
typename
...
Xs
>
__global__
void
#if
1
#if
0
__launch_bounds__(256, 2)
#endif
run_gridwise_operation
(
Xs
...
xs
)
run_gridwise_operation
(
Xs
...
xs
)
{
GridwiseOp
{}.
Run
(
xs
...);
}
...
...
composable_kernel/include/utility/amd_buffer_addressing.hpp
View file @
2a87a973
...
...
@@ -2,20 +2,10 @@
#define CK_AMD_BUFFER_ADDRESSING_HPP
#include "float_type.hpp"
#include "amd_buffer_addressing_v2.hpp"
namespace
ck
{
// For 128 bit SGPRs to supply resource constant in buffer instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
template
<
typename
T
>
union
BufferResourceConstant
{
int32x4_t
data
;
T
*
address
[
2
];
int32_t
range
[
4
];
int32_t
config
[
4
];
};
__device__
float
__llvm_amdgcn_buffer_load_f32
(
int32x4_t
srsrc
,
index_t
vindex
,
index_t
offset
,
...
...
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
View file @
2a87a973
...
...
@@ -2,11 +2,9 @@
#define CK_AMD_BUFFER_ADDRESSING_V2_HPP
#include "float_type.hpp"
#include "amd_buffer_addressing.hpp"
namespace
ck
{
#if 0
// For 128 bit SGPRs to supply resource constant in buffer instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
template
<
typename
T
>
...
...
@@ -17,27 +15,45 @@ union BufferResourceConstant
int32_t
range
[
4
];
int32_t
config
[
4
];
};
#endif
__device__
float
__llvm_amdgcn_buffer_load_f32
(
int32x4_t
srsrc
,
index_t
vindex
,
index_t
offset
,
bool
glc
,
bool
slc
)
__asm
(
"llvm.amdgcn.buffer.load.f32"
);
__device__
float
__llvm_amdgcn_raw_buffer_load_fp32
(
int32x4_t
srsrc
,
index_t
v
offset
,
index_t
soffset
,
index_t
glc_
slc
)
__asm
(
"llvm.amdgcn.
raw.
buffer.load.f32"
);
__device__
float2_t
__llvm_amdgcn_buffer_load_f32x2
(
int32x4_t
srsrc
,
index_t
vindex
,
index_t
offset
,
bool
glc
,
bool
slc
)
__asm
(
"llvm.amdgcn.buffer.load.v2f32"
);
__llvm_amdgcn_raw_buffer_load_fp32x2
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v2f32"
);
__device__
float4_t
__llvm_amdgcn_buffer_load_f32x4
(
int32x4_t
srsrc
,
index_t
vindex
,
index_t
offset
,
bool
glc
,
bool
slc
)
__asm
(
"llvm.amdgcn.buffer.load.v4f32"
);
__llvm_amdgcn_raw_buffer_load_fp32x4
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v4f32"
);
__device__
void
__llvm_amdgcn_raw_buffer_store_fp32
(
float
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.f32"
);
__device__
void
__llvm_amdgcn_raw_buffer_store_fp32x2
(
float2_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v2f32"
);
__device__
void
__llvm_amdgcn_raw_buffer_store_fp32x4
(
float4_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v4f32"
);
// buffer_load requires:
// 1) p_src_wave must be in global memory space
...
...
@@ -82,11 +98,11 @@ __device__ float amd_buffer_load_v2<float, 1>(const float* p_src_wave,
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_shift
+
src_thread_addr_offset
,
false
,
false
);
return
__llvm_amdgcn_
raw_
buffer_load_f
p
32
(
src_wave_buffer_resource
.
data
,
src_addr_shift
+
src_thread_addr_offset
,
0
,
0
);
#else
float
tmp
=
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
float
tmp
=
__llvm_amdgcn_
raw_
buffer_load_f
p
32
(
src_wave_buffer_resource
.
data
,
src_thread_addr_offset
,
0
,
0
);
return
src_thread_data_valid
?
tmp
:
float
(
0
);
#endif
...
...
@@ -112,11 +128,11 @@ __device__ float2_t amd_buffer_load_v2<float, 2>(const float* p_src_wave,
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_shift
+
src_thread_addr_offset
,
false
,
false
);
return
__llvm_amdgcn_
raw_
buffer_load_f
p
32x2
(
src_wave_buffer_resource
.
data
,
src_addr_shift
+
src_thread_addr_offset
,
0
,
0
);
#else
float2_t
tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
float2_t
tmp
=
__llvm_amdgcn_
raw_
buffer_load_f
p
32x2
(
src_wave_buffer_resource
.
data
,
src_thread_addr_offset
,
0
,
0
);
return
src_thread_data_valid
?
tmp
:
float2_t
(
0
);
#endif
...
...
@@ -142,11 +158,11 @@ __device__ float4_t amd_buffer_load_v2<float, 4>(const float* p_src_wave,
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_shift
+
src_thread_addr_offset
,
false
,
false
);
return
__llvm_amdgcn_
raw_
buffer_load_f
p
32x4
(
src_wave_buffer_resource
.
data
,
src_addr_shift
+
src_thread_addr_offset
,
0
,
0
);
#else
float4_t
tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
float4_t
tmp
=
__llvm_amdgcn_
raw_
buffer_load_f
p
32x4
(
src_wave_buffer_resource
.
data
,
src_thread_addr_offset
,
0
,
0
);
return
src_thread_data_valid
?
tmp
:
float4_t
(
0
);
#endif
...
...
@@ -173,21 +189,16 @@ __device__ void amd_buffer_store_v2<float, 1>(const float src_thread_data,
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32
(
src_thread_data
,
__llvm_amdgcn_
raw_
buffer_store_f
p
32
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_addr_shift
+
dst_thread_addr_offset
,
false
,
false
);
0
,
0
);
#else
if
(
dst_thread_data_valid
)
{
__llvm_amdgcn_buffer_store_f32
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_thread_addr_offset
,
false
,
false
);
__llvm_amdgcn_buffer_store_fp32
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
dst_thread_addr_offset
,
0
,
0
);
}
#endif
}
...
...
@@ -213,21 +224,16 @@ __device__ void amd_buffer_store_v2<float, 2>(const float2_t src_thread_data,
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x2
(
src_thread_data
,
__llvm_amdgcn_
raw_
buffer_store_f
p
32x2
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_addr_shift
+
dst_thread_addr_offset
,
false
,
false
);
0
,
0
);
#else
if
(
dst_thread_data_valid
)
{
__llvm_amdgcn_buffer_store_f32x2
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_thread_addr_offset
,
false
,
false
);
__llvm_amdgcn_raw_buffer_store_fp32x2
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
dst_thread_addr_offset
,
0
,
0
);
}
#endif
}
...
...
@@ -253,21 +259,16 @@ __device__ void amd_buffer_store_v2<float, 4>(const float4_t src_thread_data,
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x4
(
src_thread_data
,
__llvm_amdgcn_
raw_
buffer_store_f
p
32x4
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_addr_shift
+
dst_thread_addr_offset
,
false
,
false
);
0
,
0
);
#else
if
(
dst_thread_data_valid
)
{
__llvm_amdgcn_buffer_store_f32x4
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_thread_addr_offset
,
false
,
false
);
__llvm_amdgcn_raw_buffer_store_fp32x4
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
dst_thread_addr_offset
,
0
,
0
);
}
#endif
}
...
...
composable_kernel/include/utility/config.amd.hpp.in
View file @
2a87a973
...
...
@@ -11,9 +11,6 @@
#define CK_HIP_VERSION_FLAT 0
#endif
// index type: unsigned or signed
#define CK_UNSIGNED_INDEX_TYPE 0
// multi index
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
...
...
@@ -117,11 +114,7 @@ enum InMemoryDataOperation
AtomicAdd
};
#if CK_UNSIGNED_INDEX_TYPE
using index_t = uint32_t;
#else
using index_t = int32_t;
#endif
typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @
2a87a973
...
...
@@ -145,7 +145,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmN1
=
1
;
#elif
0
#elif
1
// cdata = 64, BlockSize = 256, 128x128x8
// b thread copy 2x2
constexpr
index_t
BlockSize
=
256
;
...
...
@@ -235,7 +235,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr
auto
conv_driver
=
#if 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#elif
0
#elif
1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
#elif 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment