Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
5f11dccc
Unverified
Commit
5f11dccc
authored
Apr 06, 2021
by
Chao Liu
Committed by
GitHub
Apr 06, 2021
Browse files
Merge branch 'master' into dynamic_tensor_descriptor_v5r1
parents
56d827be
d2217f30
Changes
4
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
551 additions
and
1368 deletions
+551
-1368
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+300
-789
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
...convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+200
-523
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp
...kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp
+41
-52
composable_kernel/include/utility/config.amd.hpp.in
composable_kernel/include/utility/config.amd.hpp.in
+10
-4
No files found.
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @
5f11dccc
This diff is collapsed.
Click to expand it.
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
View file @
5f11dccc
This diff is collapsed.
Click to expand it.
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp
View file @
5f11dccc
...
...
@@ -11,6 +11,47 @@
namespace
ck
{
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
// pass tensor descriptor by __CONSTANT__ void pointer
// __CONSTANT__ is needed to inform compiler void pointers in the kernel signature are pointing to
// non-modifiable parameter address space, so compiler can enable corresponding optimization
template
<
typename
GridwiseGemm
,
typename
AGlobalDesc
,
typename
FloatA
,
typename
BGlobalDesc
,
typename
FloatB
,
typename
CGlobalDesc
,
typename
FloatC
,
bool
HasMainKBlockLoop
,
bool
HasDoubleTailKBlockLoop
>
__global__
void
run_gridwise_dynamic_gemm_v1
(
const
void
__CONSTANT__
*
p_a_k_m_global_desc
,
const
FloatA
*
__restrict__
p_a_global
,
const
void
__CONSTANT__
*
p_b_k_n_global_desc
,
const
FloatB
*
__restrict__
p_b_global
,
const
void
__CONSTANT__
*
p_c_m0_m1_n0_n1_global_desc
,
FloatC
*
__restrict__
p_c_global
)
{
// first cast void __CONSTANT__* to void*
// second cast void* to Desc*
// the copy constructor of tensor descriptor doesn't take address_space(4)
const
auto
a_k_m_global_desc
=
*
reinterpret_cast
<
const
AGlobalDesc
*>
((
const
void
*
)
p_a_k_m_global_desc
);
const
auto
b_k_n_global_desc
=
*
reinterpret_cast
<
const
BGlobalDesc
*>
((
const
void
*
)
p_b_k_n_global_desc
);
const
auto
c_m0_m1_n0_n1_global_desc
=
*
reinterpret_cast
<
const
CGlobalDesc
*>
((
const
void
*
)
p_c_m0_m1_n0_n1_global_desc
);
GridwiseGemm
{}.
Run
(
a_k_m_global_desc
,
p_a_global
,
b_k_n_global_desc
,
p_b_global
,
c_m0_m1_n0_n1_global_desc
,
p_c_global
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
{},
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
{});
}
#endif
template
<
index_t
BlockSize
,
typename
FloatAB
,
typename
FloatAcc
,
...
...
@@ -427,7 +468,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1
}
}
// pass tensor descriptor by reference
template
<
bool
HasMainKBlockLoop
,
bool
HasDoubleTailKBlockLoop
>
__device__
void
Run
(
const
AGlobalDesc
&
a_k_m_global_desc
,
const
FloatAB
*
__restrict__
p_a_global
,
...
...
@@ -452,57 +492,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_v1
integral_constant
<
bool
,
HasMainKBlockLoop
>
{},
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
{});
}
// pass tensor descriptors by pointers
template
<
bool
HasMainKBlockLoop
,
bool
HasDoubleTailKBlockLoop
>
__device__
void
Run
(
const
AGlobalDesc
*
p_a_k_m_global_desc
,
const
FloatAB
*
__restrict__
p_a_global
,
const
BGlobalDesc
*
p_b_k_n_global_desc
,
const
FloatAB
*
__restrict__
p_b_global
,
const
CGlobalDesc
*
p_c_m0_m1_n0_n1_global_desc
,
FloatC
*
__restrict__
p_c_global
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
,
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
)
const
{
const
auto
a_k_m_global_desc
=
*
p_a_k_m_global_desc
;
const
auto
b_k_n_global_desc
=
*
p_b_k_n_global_desc
;
const
auto
c_m0_m1_n0_n1_global_desc
=
*
p_c_m0_m1_n0_n1_global_desc
;
Run
(
a_k_m_global_desc
,
p_a_global
,
b_k_n_global_desc
,
p_b_global
,
c_m0_m1_n0_n1_global_desc
,
p_c_global
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
{},
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
{});
}
// pass tensor descriptors by void*
template
<
bool
HasMainKBlockLoop
,
bool
HasDoubleTailKBlockLoop
>
__device__
void
Run
(
const
void
*
p_a_k_m_global_desc
,
const
FloatAB
*
__restrict__
p_a_global
,
const
void
*
p_b_k_n_global_desc
,
const
FloatAB
*
__restrict__
p_b_global
,
const
void
*
p_c_m0_m1_n0_n1_global_desc
,
FloatC
*
__restrict__
p_c_global
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
,
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
)
const
{
const
auto
a_k_m_global_desc
=
*
reinterpret_cast
<
const
AGlobalDesc
*>
(
p_a_k_m_global_desc
);
const
auto
b_k_n_global_desc
=
*
reinterpret_cast
<
const
BGlobalDesc
*>
(
p_b_k_n_global_desc
);
const
auto
c_m0_m1_n0_n1_global_desc
=
*
reinterpret_cast
<
const
CGlobalDesc
*>
(
p_c_m0_m1_n0_n1_global_desc
);
Run
(
a_k_m_global_desc
,
p_a_global
,
b_k_n_global_desc
,
p_b_global
,
c_m0_m1_n0_n1_global_desc
,
p_c_global
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
{},
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
{});
}
};
}
// namespace ck
...
...
composable_kernel/include/utility/config.amd.hpp.in
View file @
5f11dccc
...
...
@@ -7,13 +7,20 @@
#endif
#include "bfloat16_dev.hpp"
// address space for kernel parameter
#define __CONSTANT__ __attribute__((address_space(4)))
// device backend
#define CK_DEVICE_BACKEND_AMD 1
// GPU ID
#define CK_AMD_GPU_GFX906 0
#define CK_AMD_GPU_GFX908 0
#if 0
#define CK_AMD_GPU_GFX906 1
#elif 0
#define CK_AMD_GPU_GFX908 1
#elif 1
#define CK_AMD_GPU_GFX1030 1
#endif
// HIP version
#ifndef CK_HIP_VERSION_FLAT
...
...
@@ -104,9 +111,8 @@
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif
// pass tensor descriptor by value
, pointer
or void*
// pass tensor descriptor by value or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
// hack: have underlying assumption that need to be satsified, otherwise it's a bug
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment