Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
c47159b0
Commit
c47159b0
authored
Dec 01, 2023
by
aska-0096
Browse files
double __restrict__ lds pointer w/a success
parent
a6ba3d2a
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
53 additions
and
32 deletions
+53
-32
example/01_gemm/common.hpp
example/01_gemm/common.hpp
+1
-1
example/01_gemm/run_gemm_example.inc
example/01_gemm/run_gemm_example.inc
+6
-2
include/ck/host_utility/kernel_launch.hpp
include/ck/host_utility/kernel_launch.hpp
+24
-8
include/ck/stream_config.hpp
include/ck/stream_config.hpp
+1
-1
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
...ion/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+1
-1
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1_doublelds.hpp
...tion/gpu/grid/gridwise_gemm_xdl_cshuffle_v1_doublelds.hpp
+13
-12
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
...operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+1
-1
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
...tion/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+1
-1
include/ck/utility/dynamic_buffer.hpp
include/ck/utility/dynamic_buffer.hpp
+5
-5
No files found.
example/01_gemm/common.hpp
View file @
c47159b0
...
...
@@ -50,7 +50,7 @@ struct ExecutionConfig final
{
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
int
time_kernel
=
0
;
};
template
<
ck
::
index_t
...
Is
>
...
...
example/01_gemm/run_gemm_example.inc
View file @
c47159b0
...
...
@@ -162,7 +162,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
return
true
;
}
#endif
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
,
0
,
300
,
3000
});
std
::
size_t
flop
=
2_
uz
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
...
...
@@ -178,7 +178,11 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
if
(
config
.
time_kernel
==
1
)
std
::
cout
<<
"Perf mode Mean: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
else
if
(
config
.
time_kernel
==
2
)
std
::
cout
<<
"Perf mode Median: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
});
...
...
include/ck/host_utility/kernel_launch.hpp
View file @
c47159b0
...
...
@@ -30,7 +30,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
block_dim
.
y
,
block_dim
.
z
);
printf
(
"Warm up
1
time
\n
"
);
printf
(
"Warm up
%d
time
\n
"
,
stream_config
.
cold_niters_
);
#endif
// warm up
for
(
int
i
=
0
;
i
<
stream_config
.
cold_niters_
;
++
i
)
...
...
@@ -48,23 +48,39 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
hip_check_error
(
hipEventCreate
(
&
start
));
hip_check_error
(
hipEventCreate
(
&
stop
));
hip_check_error
(
hipDeviceSynchronize
());
hip_check_error
(
hipEventRecord
(
start
,
stream_config
.
stream_id_
));
std
::
vector
<
float
>
execution_time_series
;
for
(
int
i
=
0
;
i
<
nrepeat
;
++
i
)
{
float
execution_time
=
0
;
hip_check_error
(
hipDeviceSynchronize
());
hip_check_error
(
hipEventRecord
(
start
,
stream_config
.
stream_id_
));
kernel
<<<
grid_dim
,
block_dim
,
lds_byte
,
stream_config
.
stream_id_
>>>
(
args
...);
hip_check_error
(
hipGetLastError
());
hip_check_error
(
hipEventRecord
(
stop
,
stream_config
.
stream_id_
));
hip_check_error
(
hipEventSynchronize
(
stop
));
hip_check_error
(
hipEventElapsedTime
(
&
execution_time
,
start
,
stop
));
execution_time_series
.
push_back
(
execution_time
);
}
hip_check_error
(
hipEventRecord
(
stop
,
stream_config
.
stream_id_
))
;
hip_check_error
(
hipEventSynchronize
(
stop
))
;
float
mean_execution_time
=
0
;
float
median_execution_time
=
0
;
float
total_time
=
0
;
#if DEBUG_LOG
for
(
int
i
=
0
;
i
<
nrepeat
;
i
++
){
std
::
cout
<<
i
<<
" th launch, execution time = "
<<
execution_time_series
[
i
]
<<
" ms"
<<
std
::
endl
;
}
#endif
hip_check_error
(
hipEventElapsedTime
(
&
total_time
,
start
,
stop
));
std
::
sort
(
execution_time_series
.
begin
(),
execution_time_series
.
end
(
));
return
total_time
/
nrepeat
;
mean_execution_time
=
std
::
reduce
(
execution_time_series
.
begin
(),
execution_time_series
.
end
(),
.0
)
/
static_cast
<
float
>
(
nrepeat
);
median_execution_time
=
execution_time_series
[
execution_time_series
.
size
()
/
2
];
if
(
stream_config
.
time_kernel_
==
1
)
return
mean_execution_time
;
else
return
median_execution_time
;
}
else
{
...
...
include/ck/stream_config.hpp
View file @
c47159b0
...
...
@@ -9,7 +9,7 @@
struct
StreamConfig
{
hipStream_t
stream_id_
=
nullptr
;
bool
time_kernel_
=
false
;
int
time_kernel_
=
0
;
int
log_level_
=
0
;
int
cold_niters_
=
50
;
int
nrepeat_
=
200
;
...
...
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
View file @
c47159b0
...
...
@@ -123,7 +123,7 @@ struct ThreadGroupTensorSliceTransfer_v4r1
template
<
typename
DstBuffer
,
index_t
ThreadScratchId
=
0
>
__device__
void
RunWrite
(
const
DstDesc
&
dst_desc
,
DstBuffer
&
__restrict__
dst_buf
,
DstBuffer
&
dst_buf
,
Number
<
ThreadScratchId
>
thread_scratch_id
=
Number
<
ThreadScratchId
>
{})
{
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1_doublelds.hpp
View file @
c47159b0
...
...
@@ -30,6 +30,7 @@ __global__ void
__shared__
char
p_shared_0
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
__shared__
char
p_shared_1
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
GridwiseGemm
::
template
Run
<
HasMainKBlockLoop
>(
karg
.
p_a_grid
,
karg
.
p_b_grid
,
karg
.
p_c_grid
,
p_shared_0
,
p_shared_1
,
karg
);
#else
...
...
@@ -46,9 +47,9 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
kernel_gemm_xdl_cshuffle_v1
(
const
FloatA
*
__restrict__
p_a_grid
,
const
FloatB
*
__restrict__
p_b_grid
,
FloatC
*
__restrict__
p_c_grid
,
kernel_gemm_xdl_cshuffle_v1
(
const
FloatA
*
p_a_grid
,
const
FloatB
*
p_b_grid
,
FloatC
*
p_c_grid
,
typename
GridwiseGemm
::
Problem
problem
)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
...
...
@@ -706,11 +707,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
using
Block2CTileMap
=
BlockToCTileMap_M00_N0_M01Adapt
<
MPerBlock
,
NPerBlock
>
;
template
<
bool
HasMainKBlockLoop
>
__device__
static
void
Run
(
const
FloatA
*
__restrict__
p_a_grid
,
const
FloatB
*
__restrict__
p_b_grid
,
FloatC
*
__restrict__
p_c_grid
,
void
*
__restrict__
p_shared_0
,
void
*
__restrict__
p_shared_1
,
__device__
static
void
Run
(
const
FloatA
*
p_a_grid
,
const
FloatB
*
p_b_grid
,
FloatC
*
p_c_grid
,
void
*
p_shared_0
,
void
*
p_shared_1
,
const
Problem
&
problem
)
{
const
auto
a_grid_desc_ak0_m_ak1
=
MakeAGridDescriptor_AK0_M_AK1
(
...
...
@@ -872,17 +873,17 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
a_block_desc_ak0_m_ak1
.
GetElementSpaceSize
(),
max_lds_align
);
auto
a_block_buf_ping
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Lds
>
(
static_cast
<
ComputeTypeA
*
__restrict__
>
(
p_shared_0
),
a_block_desc_ak0_m_ak1
.
GetElementSpaceSize
());
static_cast
<
ComputeTypeA
*>
(
p_shared_0
),
a_block_desc_ak0_m_ak1
.
GetElementSpaceSize
());
auto
b_block_buf_ping
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Lds
>
(
static_cast
<
ComputeTypeB
*
__restrict__
>
(
p_shared_0
)
+
a_block_space_size_aligned
,
static_cast
<
ComputeTypeB
*>
(
p_shared_0
)
+
a_block_space_size_aligned
,
b_block_desc_bk0_n_bk1
.
GetElementSpaceSize
());
auto
a_block_buf_pong
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Lds
>
(
static_cast
<
ComputeTypeA
*
__restrict__
>
(
p_shared_1
),
a_block_desc_ak0_m_ak1
.
GetElementSpaceSize
());
static_cast
<
ComputeTypeA
*>
(
p_shared_1
),
a_block_desc_ak0_m_ak1
.
GetElementSpaceSize
());
auto
b_block_buf_pong
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Lds
>
(
static_cast
<
ComputeTypeB
*
__restrict__
>
(
p_shared_1
)
+
a_block_space_size_aligned
,
static_cast
<
ComputeTypeB
*>
(
p_shared_1
)
+
a_block_space_size_aligned
,
b_block_desc_bk0_n_bk1
.
GetElementSpaceSize
());
auto
a_block_bufs
=
make_tuple
(
a_block_buf_ping
,
a_block_buf_pong
);
...
...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
View file @
c47159b0
...
...
@@ -1050,7 +1050,7 @@ struct ThreadwiseTensorSliceTransfer_v4
typename
DstBuffer
>
__device__
void
Run
(
const
SrcDesc
&
,
const
SrcRefToOriginDisplacement
&
,
const
SrcBuffer
&
__restrict__
src_buf
,
const
SrcBuffer
&
src_buf
,
const
DstDesc
&
,
const
DstOriginIdx
&
,
DstBuffer
&
dst_buf
)
const
...
...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
View file @
c47159b0
...
...
@@ -382,7 +382,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
template
<
typename
DstBuffer
,
index_t
ThreadScratchId
=
0
>
__device__
void
RunWrite
(
const
DstDesc
&
dst_desc
,
DstBuffer
&
__restrict__
dst_buf
,
DstBuffer
&
dst_buf
,
Number
<
ThreadScratchId
>
thread_scratch_id
=
Number
<
ThreadScratchId
>
{})
{
// if there is transpose, it's done here
...
...
include/ck/utility/dynamic_buffer.hpp
View file @
c47159b0
...
...
@@ -25,16 +25,16 @@ struct DynamicBuffer
{
using
type
=
T
;
T
*
__restrict__
p_data_
;
T
*
p_data_
;
ElementSpaceSize
element_space_size_
;
T
invalid_element_value_
=
T
{
0
};
__host__
__device__
constexpr
DynamicBuffer
(
T
*
__restrict__
p_data
,
ElementSpaceSize
element_space_size
)
__host__
__device__
constexpr
DynamicBuffer
(
T
*
p_data
,
ElementSpaceSize
element_space_size
)
:
p_data_
{
p_data
},
element_space_size_
{
element_space_size
}
{
}
__host__
__device__
constexpr
DynamicBuffer
(
T
*
__restrict__
p_data
,
__host__
__device__
constexpr
DynamicBuffer
(
T
*
p_data
,
ElementSpaceSize
element_space_size
,
T
invalid_element_value
)
:
p_data_
{
p_data
},
...
...
@@ -410,7 +410,7 @@ template <AddressSpaceEnum BufferAddressSpace,
AmdBufferCoherenceEnum
coherence
=
AmdBufferCoherenceEnum
::
DefaultCoherence
,
typename
T
,
typename
ElementSpaceSize
>
__host__
__device__
constexpr
auto
make_dynamic_buffer
(
T
*
__restrict__
p
,
ElementSpaceSize
element_space_size
)
__host__
__device__
constexpr
auto
make_dynamic_buffer
(
T
*
p
,
ElementSpaceSize
element_space_size
)
{
return
DynamicBuffer
<
BufferAddressSpace
,
T
,
ElementSpaceSize
,
true
,
coherence
>
{
p
,
element_space_size
};
...
...
@@ -424,7 +424,7 @@ template <
typename
X
,
typename
enable_if
<
is_same
<
remove_cvref_t
<
T
>,
remove_cvref_t
<
X
>>::
value
,
bool
>::
type
=
false
>
__host__
__device__
constexpr
auto
make_dynamic_buffer
(
T
*
__restrict__
p
,
ElementSpaceSize
element_space_size
,
X
invalid_element_value
)
make_dynamic_buffer
(
T
*
p
,
ElementSpaceSize
element_space_size
,
X
invalid_element_value
)
{
return
DynamicBuffer
<
BufferAddressSpace
,
T
,
ElementSpaceSize
,
false
,
coherence
>
{
p
,
element_space_size
,
invalid_element_value
};
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment