Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
2564c493
Commit
2564c493
authored
Aug 13, 2022
by
Chao Liu
Browse files
Merge remote-tracking branch 'origin/develop' into fused-gemm
parents
000eefbf
10b3278b
Changes
28
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
3100 additions
and
0 deletions
+3100
-0
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp
...peration/gpu/device/device_gemm_multiple_d_multiple_r.hpp
+85
-0
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
...device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
+873
-0
include/ck/tensor_operation/gpu/device/device_gemm_xdl_skip_b_lds.hpp
...ensor_operation/gpu/device/device_gemm_xdl_skip_b_lds.hpp
+521
-0
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
...k/tensor_operation/gpu/element/element_wise_operation.hpp
+29
-0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
...grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
+901
-0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
...operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
+677
-0
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
...operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+4
-0
include/ck/utility/synchronization.hpp
include/ck/utility/synchronization.hpp
+10
-0
No files found.
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp
0 → 100644
View file @
2564c493
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include "device_base.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
// FIXME: the DeviceGemmReduce type needs to define the problem well
/// Abstract device-operator interface, derived from BaseOperator, that exposes
/// two pure virtual factory functions: one building an argument object and one
/// building an invoker object.
///
/// The number of "D" and "R" tensors is taken at compile time from
/// DsDataType::Size() and RsDataType::Size() respectively.
///
/// NOTE(review): the layout / data-type / elementwise-operation template
/// parameters are not used by this interface other than in the signature of
/// MakeArgumentPointer; their exact meaning is defined by the concrete
/// implementations -- confirm there.
template <typename ALayout,
          typename BLayout,
          typename DELayout,
          typename ADataType,
          typename BDataType,
          typename DsDataType,
          typename EDataType,
          typename RsDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CDEElementwiseOperation,
          typename QsElementwiseOperation,
          typename RsElementwiseOperation>
struct DeviceGemmMultipleDMultipleR : public BaseOperator
{
    /// Count of D tensors, as reported by DsDataType::Size().
    static constexpr index_t NumDTensor = DsDataType::Size();

    /// Count of R tensors, as reported by RsDataType::Size().
    static constexpr index_t NumRTensor = RsDataType::Size();

    /// Builds the argument object for one problem instance.
    ///
    /// NOTE(review): the roles below are inferred from the parameter names
    /// only -- confirm against a concrete implementation.
    ///
    /// @param p_a, p_b        Untyped read-only pointers (presumably the A and B inputs).
    /// @param p_ds            NumDTensor untyped read-only pointers (presumably the D inputs).
    /// @param p_e             Untyped writable pointer (presumably the E output).
    /// @param p_rs            NumRTensor untyped writable pointers (presumably the R outputs).
    /// @param M, N, K         Problem sizes (presumably the GEMM dimensions).
    /// @param StrideA, StrideB, StrideDs, StrideE
    ///                        Strides associated with A, B, each D, and E.
    /// @param a_element_op, b_element_op, cde_element_op, qs_element_op, rs_element_op
    ///                        Elementwise operation objects, passed by value.
    /// @return Owning pointer to a BaseArgument.
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_a,
                        const void* p_b,
                        std::array<const void*, NumDTensor> p_ds,
                        void* p_e,
                        std::array<void*, NumRTensor> p_rs,
                        ck::index_t M,
                        ck::index_t N,
                        ck::index_t K,
                        ck::index_t StrideA,
                        ck::index_t StrideB,
                        std::array<ck::index_t, NumDTensor> StrideDs,
                        ck::index_t StrideE,
                        AElementwiseOperation a_element_op,
                        BElementwiseOperation b_element_op,
                        CDEElementwiseOperation cde_element_op,
                        QsElementwiseOperation qs_element_op,
                        RsElementwiseOperation rs_element_op) = 0;

    /// Builds the invoker object.
    /// @return Owning pointer to a BaseInvoker.
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
/// Owning-pointer alias: std::unique_ptr to a DeviceGemmMultipleDMultipleR
/// instantiated with the same thirteen template arguments, in the same order.
template <typename ALayout,
          typename BLayout,
          typename DELayout,
          typename ADataType,
          typename BDataType,
          typename DsDataType,
          typename EDataType,
          typename RsDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CDEElementwiseOperation,
          typename QsElementwiseOperation,
          typename RsElementwiseOperation>
using DeviceGemmMultipleDMultipleRPtr =
    std::unique_ptr<DeviceGemmMultipleDMultipleR<ALayout,
                                                 BLayout,
                                                 DELayout,
                                                 ADataType,
                                                 BDataType,
                                                 DsDataType,
                                                 EDataType,
                                                 RsDataType,
                                                 AElementwiseOperation,
                                                 BElementwiseOperation,
                                                 CDEElementwiseOperation,
                                                 QsElementwiseOperation,
                                                 RsElementwiseOperation>>;
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
0 → 100644
View file @
2564c493
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/device/device_gemm_xdl_skip_b_lds.hpp
0 → 100644
View file @
2564c493
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
View file @
2564c493
...
...
@@ -130,6 +130,35 @@ struct AddHardswishAdd
}
};
// C = A * B
// E = C + D0 + D1
/// Elementwise functor computing e = c + d0 + d1.
///
/// d0 and d1 are each converted to C with type_convert<C>, the three values
/// are summed in C (left to right), and the sum is converted to E with
/// type_convert<E> before being stored in e.
struct AddAdd
{
    /// @param e   Output; overwritten with the converted sum.
    /// @param c   First addend; its type C is the accumulation type.
    /// @param d0  Second addend, converted to C before the addition.
    /// @param d1  Third addend, converted to C before the addition.
    template <typename E, typename C, typename D0, typename D1>
    __host__ __device__ void operator()(E& e, const C& c, const D0& d0, const D1& d1) const
    {
        // Every operand type is restricted to half_t, float or double for now.
        static_assert(is_same<E, half_t>::value || is_same<E, float>::value ||
                          is_same<E, double>::value,
                      "Data type is not supported by this operation!");

        static_assert(is_same<C, half_t>::value || is_same<C, float>::value ||
                          is_same<C, double>::value,
                      "Data type is not supported by this operation!");

        static_assert(is_same<D0, half_t>::value || is_same<D0, float>::value ||
                          is_same<D0, double>::value,
                      "Data type is not supported by this operation!");

        static_assert(is_same<D1, half_t>::value || is_same<D1, float>::value ||
                          is_same<D1, double>::value,
                      "Data type is not supported by this operation!");

        // Bring both D operands into the accumulation type first.
        const C d0_as_c = type_convert<C>(d0);
        const C d1_as_c = type_convert<C>(d1);

        // Same left-to-right association as (c + d0) + d1.
        const C sum = c + d0_as_c + d1_as_c;

        e = type_convert<E>(sum);
    }
};
// C = A * B
// E = FastGelu(C + D0 + D1)
struct
AddAddFastGelu
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
0 → 100644
View file @
2564c493
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp
0 → 100644
View file @
2564c493
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
View file @
2564c493
...
...
@@ -1192,6 +1192,10 @@ struct ThreadwiseTensorSliceTransfer_v4
move_tensor_coordinate
(
SrcDesc
{},
src_ref_coord_
,
src_slice_move_step_iter
);
}
// Overwrite the stored source reference coordinate (src_ref_coord_) with a
// coordinate freshly built from src_ref_idx on a default-constructed SrcDesc.
__device__ void SetSrcCoord(const Index& src_ref_idx)
{
    src_ref_coord_ = make_tensor_coordinate(SrcDesc{}, src_ref_idx);
}
private:
SrcCoord
src_ref_coord_
;
...
...
include/ck/utility/synchronization.hpp
View file @
2564c493
...
...
@@ -18,5 +18,15 @@ __device__ void block_sync_lds()
__syncthreads
();
#endif
}
// Device-side helper that emits a single `s_nop 0` instruction through a
// volatile inline-assembly statement with no operands.
//
// The `#if 1` below hard-selects the inline-assembly path; the `#else` branch
// (a call to __builtin_amdgcn_sched_barrier(0)) is currently compiled out.
// NOTE(review): the intended purpose of the nop (e.g. hazard avoidance or
// scheduling) is not visible here -- confirm at the call sites.
__device__
void
s_nop
()
{
#if 1
// Active path: raw `s_nop 0` via inline assembly.
asm
volatile
(
"\
s_nop 0
\n
\
"
::
);
#else
// Disabled alternative: compiler scheduling-barrier builtin with mask 0.
__builtin_amdgcn_sched_barrier
(
0
);
#endif
}
}
// namespace ck
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment