Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
37857dbd
Commit
37857dbd
authored
Jul 05, 2022
by
root
Browse files
added block_lds without barrier sync
parent
13ceb494
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
48 additions
and
35 deletions
+48
-35
example/01_gemm/gemm_xdl_fp16.cpp
example/01_gemm/gemm_xdl_fp16.cpp
+14
-15
include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
...tion/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
+10
-8
include/ck/tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp
.../tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp
+1
-1
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
...tion/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
+13
-11
include/ck/utility/synchronization.hpp
include/ck/utility/synchronization.hpp
+10
-0
No files found.
example/01_gemm/gemm_xdl_fp16.cpp
View file @
37857dbd
...
@@ -6,19 +6,18 @@
...
@@ -6,19 +6,18 @@
#include <initializer_list>
#include <initializer_list>
#include <cstdlib>
#include <cstdlib>
#include <stdlib.h>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "ck/ck.hpp"
#include "config.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "device.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
#include "host_tensor.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "host_tensor_generator.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp"
#include "device_tensor.hpp"
#include "device_gemm_xdl.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "device_gemm_xdl_cshuffle.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "device_gemm_xdl_waveletmodel_cshuffle.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "gemm_specialization.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
@@ -165,8 +164,8 @@ int main(int argc, char* argv[])
...
@@ -165,8 +164,8 @@ int main(int argc, char* argv[])
auto
c_element_op
=
CElementOp
{};
auto
c_element_op
=
CElementOp
{};
// do GEMM
// do GEMM
//replace DeviceGemmInstance_WaveletModel for wavelet gemm pipeline
//
replace DeviceGemmInstance_WaveletModel for wavelet gemm pipeline
//auto gemm = DeviceGemmInstance_WaveletModel{};
//
auto gemm = DeviceGemmInstance_WaveletModel{};
auto
gemm
=
DeviceGemmInstance
{};
auto
gemm
=
DeviceGemmInstance
{};
auto
invoker
=
gemm
.
MakeInvoker
();
auto
invoker
=
gemm
.
MakeInvoker
();
auto
argument
=
gemm
.
MakeArgument
(
static_cast
<
ADataType
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()),
auto
argument
=
gemm
.
MakeArgument
(
static_cast
<
ADataType
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()),
...
...
include/ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp
View file @
37857dbd
#pragma once
#pragma once
#include <iostream>
#include <iostream>
#include <sstream>
#include <sstream>
#include "device.hpp"
#include "device_gemm.hpp"
#include "ck/utility/common_header.hpp"
#include "common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "tensor_layout.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "tensor_descriptor.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "gridwise_gemm_xdl_waveletmodel_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp"
#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp
View file @
37857dbd
#pragma once
#pragma once
#include "common_header.hpp"
#include "
ck/utility/
common_header.hpp"
namespace
ck
{
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp
View file @
37857dbd
#pragma once
#pragma once
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "ck/utility/common_header.hpp"
#include "tensor_descriptor.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "thread_group_tensor_slice_transfer_v6r1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "gridwise_gemm_waveletmodel.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_waveletmodel.hpp"
namespace
ck
{
namespace
ck
{
template
<
typename
GridwiseGemm
,
template
<
typename
GridwiseGemm
,
...
@@ -689,8 +693,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle
...
@@ -689,8 +693,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle
// 4th mfma instruction output space : 3 7 11 15 19 ....
// 4th mfma instruction output space : 3 7 11 15 19 ....
// you can pack 4 registers output space into 2WORD and do global write
// you can pack 4 registers output space into 2WORD and do global write
// (no LDS swizzling required)
// (no LDS swizzling required)
// 2. avoid using s_barrier in this case where not all 256 threads required to
// swizzle c layout
static_for
<
0
,
num_access
,
1
>
{}([
&
](
auto
access_id
)
{
static_for
<
0
,
num_access
,
1
>
{}([
&
](
auto
access_id
)
{
// make sure it's safe to write to LDS
// make sure it's safe to write to LDS
...
...
include/ck/utility/synchronization.hpp
View file @
37857dbd
...
@@ -18,5 +18,15 @@ __device__ void block_sync_lds()
...
@@ -18,5 +18,15 @@ __device__ void block_sync_lds()
__syncthreads
();
__syncthreads
();
#endif
#endif
}
}
__device__
void
block_lds
()
{
#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
asm
volatile
(
"\
s_waitcnt lgkmcnt(0) \
"
::
);
#else
__syncthreads
();
#endif
}
}
// namespace ck
}
// namespace ck
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment