Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
39a1f853
Commit
39a1f853
authored
Sep 15, 2023
by
Harisankar Sadasivan
Browse files
clang-format changes for pr881
parent
a20863b0
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
27 additions
and
30 deletions
+27
-30
example/53_gemv_splitk/gemv_splitk_fp16.cpp
example/53_gemv_splitk/gemv_splitk_fp16.cpp
+4
-6
include/ck/tensor_operation/gpu/device/device_gemv.hpp
include/ck/tensor_operation/gpu/device/device_gemv.hpp
+13
-14
include/ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp
...k/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp
+1
-1
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+4
-4
include/ck/tensor_operation/gpu/grid/gridwise_gemv_splitk.hpp
...ude/ck/tensor_operation/gpu/grid/gridwise_gemv_splitk.hpp
+5
-5
No files found.
example/53_gemv_splitk/gemv_splitk_fp16.cpp
View file @
39a1f853
...
...
@@ -19,12 +19,10 @@ using CElementOp = PassThrough;
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNPadding
;
#define K1 8 //K1PerThread:2,4,8
#define K0 4 //K0PerBlock:1,2,3,4...32
#define N1 2 //Nperthread:2,4,8
#define B 64 //block-size:64
#define K1 8 // K1PerThread:2,4,8
#define K0 4 // K0PerBlock:1,2,3,4...32
#define N1 2 // Nperthread:2,4,8
#define B 64 // block-size:64
// clang-format off
using
DeviceGemvInstance
=
ck
::
tensor_operation
::
device
::
deviceGemvDl
/*
...
...
include/ck/tensor_operation/gpu/device/device_gemv.hpp
View file @
39a1f853
...
...
@@ -20,20 +20,19 @@ template <typename ALayout,
typename
CElementwiseOperation
>
struct
DeviceGemv
:
public
BaseOperator
{
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_a
,
const
void
*
p_b
,
void
*
p_c
,
ck
::
index_t
M
,
ck
::
index_t
N
,
ck
::
index_t
K
,
ck
::
index_t
StrideA
,
ck
::
index_t
StrideB
,
ck
::
index_t
StrideC
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
,
ck
::
index_t
KBatch
=
1
)
=
0
;
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_a
,
const
void
*
p_b
,
void
*
p_c
,
ck
::
index_t
M
,
ck
::
index_t
N
,
ck
::
index_t
K
,
ck
::
index_t
StrideA
,
ck
::
index_t
StrideB
,
ck
::
index_t
StrideC
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
,
ck
::
index_t
KBatch
=
1
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemv_splitk.hpp
View file @
39a1f853
...
...
@@ -271,7 +271,7 @@ struct deviceGemvDl : public DeviceGemv<ALayout,
return
false
;
}
}
// //
// //
// polymorphic
bool
IsSupportedArgument
(
const
BaseArgument
*
p_arg
)
override
{
...
...
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
View file @
39a1f853
...
...
@@ -658,14 +658,14 @@ struct BlockToCTileMap_3DGrid_KSplit
return
make_tuple
(
blockIdx
.
z
,
blockIdx
.
y
,
blockIdx
.
x
);
}
//HS: Map 1D block-id to 3D tuple (M,N,K)
//
HS: Map 1D block-id to 3D tuple (M,N,K)
__host__
__device__
inline
constexpr
auto
convert_1D_block_idx_to_3D_tuple
(
const
index_t
&
block_1d_id
,
const
index_t
&
N
,
const
index_t
&
k_batch
)
const
{
const
auto
Ndim
=
math
::
integer_divide_ceil
(
N
,
NPerBlock
);
const
auto
Ndim
=
math
::
integer_divide_ceil
(
N
,
NPerBlock
);
return
make_tuple
(((
block_1d_id
)
/
(
k_batch
*
Ndim
)),
(((
block_1d_id
)
/
k_batch
)
%
Ndim
),
(
block_1d_id
)
%
k_batch
);
// returns 3D tuple as (Mid,Nid,Kid)
(((
block_1d_id
)
/
k_batch
)
%
Ndim
),
(
block_1d_id
)
%
k_batch
);
// returns 3D tuple as (Mid,Nid,Kid)
}
template
<
typename
CTileIdx
,
typename
CTileDim
>
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemv_splitk.hpp
View file @
39a1f853
...
...
@@ -27,12 +27,12 @@ template <typename GridwiseGemv,
typename
Block2CTileMap
>
__global__
void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
kernel_gemv_dl_v1r3
(
typename
GridwiseGemv
::
Argument
karg
,
const
Block2CTileMap
&
block_2_ctile_map
)
//: in __global__ functions, struct is
// better for reduced load overhead
kernel_gemv_dl_v1r3
(
typename
GridwiseGemv
::
Argument
karg
,
const
Block2CTileMap
&
block_2_ctile_map
)
//: in __global__ functions, struct is
// better for reduced load overhead
{
constexpr
index_t
shared_block_size
=
GridwiseGemv
::
GetSharedMemoryNumberOfByte
()
/
sizeof
(
FloatAB
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment