Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
24af0144
Unverified
Commit
24af0144
authored
Nov 12, 2022
by
Po Yen Chen
Committed by
GitHub
Nov 12, 2022
Browse files
Merge branch 'develop' into gemm_layernorm_welford
parents
961f5e9e
b79bbbc2
Changes
813
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1665 additions
and
435 deletions
+1665
-435
example/44_elementwise_permute/CMakeLists.txt
example/44_elementwise_permute/CMakeLists.txt
+1
-0
example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
...le/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
+115
-0
example/45_elementwise_normalization/CMakeLists.txt
example/45_elementwise_normalization/CMakeLists.txt
+1
-0
example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
...entwise_normalization/elementwise_layernorm_blockwise.cpp
+195
-0
example/CMakeLists.txt
example/CMakeLists.txt
+2
-0
include/ck/ck.hpp
include/ck/ck.hpp
+7
-12
include/ck/tensor_description/tensor_space_filling_curve.hpp
include/ck/tensor_description/tensor_space_filling_curve.hpp
+6
-4
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
...e/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+38
-2
include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
...operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
+2
-1
include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
...n/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
+39
-28
include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
.../tensor_operation/gpu/device/device_batchnorm_forward.hpp
+13
-8
include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
...ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
+3
-1
include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
...operation/gpu/device/device_elementwise_normalization.hpp
+68
-0
include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
...on/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
+1
-1
include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
...r_operation/gpu/device/device_grouped_conv_bwd_weight.hpp
+11
-10
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
...k/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
+55
-0
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
.../gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
+837
-0
include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp
...n/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp
+25
-19
include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
...device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+235
-313
include/ck/tensor_operation/gpu/device/device_normalization.hpp
...e/ck/tensor_operation/gpu/device/device_normalization.hpp
+11
-36
No files found.
example/44_elementwise_permute/CMakeLists.txt
0 → 100644
View file @
24af0144
add_example_executable
(
example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp
)
example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
0 → 100644
View file @
24af0144
#include <iostream>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceElementwisePermuteInstance
=
ck
::
tensor_operation
::
device
::
DeviceElementwise
<
ck
::
Tuple
<
ADataType
>
,
ck
::
Tuple
<
BDataType
>
,
PassThrough
,
4
,
8
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
1
>>
;
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
Functor
>
void
host_elementwise4D
(
HostTensorB
&
B_nhwc
,
const
HostTensorA
&
A_nchw
,
Functor
functor
)
{
for
(
std
::
size_t
n
=
0
;
n
<
A_nchw
.
mDesc
.
GetLengths
()[
0
];
++
n
)
for
(
std
::
size_t
c
=
0
;
c
<
A_nchw
.
mDesc
.
GetLengths
()[
1
];
++
c
)
for
(
std
::
size_t
h
=
0
;
h
<
A_nchw
.
mDesc
.
GetLengths
()[
2
];
++
h
)
for
(
std
::
size_t
w
=
0
;
w
<
A_nchw
.
mDesc
.
GetLengths
()[
3
];
++
w
)
{
auto
a_val
=
A_nchw
(
n
,
c
,
h
,
w
);
functor
(
B_nhwc
(
n
,
h
,
w
,
c
),
a_val
);
}
}
int
main
()
{
bool
do_verification
=
true
;
bool
time_kernel
=
true
;
std
::
vector
<
std
::
size_t
>
nchw
=
{
16
,
128
,
32
,
64
};
std
::
vector
<
std
::
size_t
>
nhwc
=
{
16
,
32
,
64
,
128
};
Tensor
<
ADataType
>
a
(
nchw
);
Tensor
<
BDataType
>
b
(
nhwc
);
a
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
DeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
a
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
b
.
mDesc
.
GetElementSpaceSize
());
a_device_buf
.
ToDevice
(
a
.
mData
.
data
());
std
::
array
<
const
void
*
,
1
>
input
=
{
a_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
void
*
,
1
>
output
=
{
b_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
ck
::
index_t
,
4
>
ab_lengths
;
std
::
array
<
ck
::
index_t
,
4
>
a_strides
=
{
static_cast
<
int
>
(
nchw
[
1
]
*
nchw
[
2
]
*
nchw
[
3
]),
static_cast
<
int
>
(
nchw
[
2
]
*
nchw
[
3
]),
static_cast
<
int
>
(
nchw
[
3
]),
1
};
std
::
array
<
ck
::
index_t
,
4
>
b_strides
=
{
static_cast
<
int
>
(
nhwc
[
1
]
*
nhwc
[
2
]
*
nhwc
[
3
]),
1
,
static_cast
<
int
>
(
nhwc
[
2
]
*
nhwc
[
3
]),
static_cast
<
int
>
(
nhwc
[
3
])};
std
::
copy
(
nchw
.
begin
(),
nchw
.
end
(),
ab_lengths
.
begin
());
auto
broadcastPermute
=
DeviceElementwisePermuteInstance
{};
auto
argument
=
broadcastPermute
.
MakeArgumentPointer
(
ab_lengths
,
{
a_strides
},
{
b_strides
},
input
,
output
,
PassThrough
{});
if
(
!
broadcastPermute
.
IsSupportedArgument
(
argument
.
get
()))
{
throw
std
::
runtime_error
(
"The runtime parameters seems not supported by the device instance, exiting!"
);
};
std
::
cout
<<
"A (nchw): "
<<
a
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"B (nhwc): "
<<
b
.
mDesc
<<
std
::
endl
;
auto
broadcastPermute_invoker_ptr
=
broadcastPermute
.
MakeInvokerPointer
();
float
ave_time
=
broadcastPermute_invoker_ptr
->
Run
(
argument
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
nchw
[
0
]
*
nchw
[
1
]
*
nchw
[
2
]
*
nchw
[
3
];
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
(
nchw
[
0
]
*
nchw
[
1
]
*
nchw
[
2
]
*
nchw
[
3
])
+
sizeof
(
BDataType
)
*
(
nchw
[
0
]
*
nchw
[
1
]
*
nchw
[
2
]
*
nchw
[
3
]);
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
b_device_buf
.
FromDevice
(
b
.
mData
.
data
());
Tensor
<
BDataType
>
host_b
(
nhwc
);
host_elementwise4D
(
host_b
,
a
,
PassThrough
{});
pass
&=
ck
::
utils
::
check_err
(
b
.
mData
,
host_b
.
mData
,
"Error: Incorrect results b"
,
1e-3
,
1e-3
);
}
return
pass
?
0
:
1
;
}
example/45_elementwise_normalization/CMakeLists.txt
0 → 100644
View file @
24af0144
add_example_executable
(
example_elementwise_layernorm_blockwise elementwise_layernorm_blockwise.cpp
)
example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
using
ADataType
=
ck
::
half_t
;
// Input 1
using
BDataType
=
ck
::
half_t
;
// Input 2
using
XDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
AccDataType
=
float
;
using
XElementwiseOperation
=
ck
::
tensor_operation
::
element_wise
::
Add
;
using
YElementwiseOperation
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
constexpr
int
Rank
=
2
;
constexpr
int
NumReduceDim
=
1
;
// X = Elementwise(input1, input2, input3, ...)
// Y = Layernorm(X, beta, gamma)
using
DeviceInstance
=
ck
::
tensor_operation
::
device
::
DeviceElementwiseNormalizationImpl
<
ck
::
Tuple
<
ADataType
,
BDataType
>
,
GammaDataType
,
BetaDataType
,
AccDataType
,
YDataType
,
XElementwiseOperation
,
YElementwiseOperation
,
Rank
,
NumReduceDim
,
256
,
// BlockSize
8
,
// ClusterM
32
,
// ClusterK
1
,
// SliceM
32
,
// SliceK
1
,
// SrcVecDim (0=M, 1=K)
8
,
// SrcScalarPerVector
1
,
// GammaVecDim (0=M, 1=K)
8
,
// GammaScalarPerVector
1
,
// BetaVecDim (0=M, 1=K)
8
,
// BetaScalarPerVector
8
>
;
// OutScalarPerVector
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
HostTensorC
,
typename
Functor
>
void
host_elementwise2D
(
HostTensorC
&
C
,
const
HostTensorA
&
A
,
const
HostTensorB
&
B
,
const
std
::
vector
<
std
::
size_t
>&
shape
,
Functor
functor
)
{
using
ctype
=
ck
::
remove_reference_t
<
decltype
(
C
(
0
,
0
))
>
;
for
(
std
::
size_t
m
=
0
;
m
<
shape
[
0
];
++
m
)
for
(
std
::
size_t
n
=
0
;
n
<
shape
[
1
];
++
n
)
{
auto
a_val
=
A
(
m
,
n
);
auto
b_val
=
B
(
m
,
n
);
ctype
c_val
=
0
;
functor
(
c_val
,
a_val
,
b_val
);
C
(
m
,
n
)
=
c_val
;
}
}
int
main
()
{
bool
time_kernel
=
true
;
ck
::
index_t
M
=
48
*
256
;
ck
::
index_t
N
=
1024
;
ck
::
index_t
Stride
=
N
;
auto
f_host_tensor_descriptor1d
=
[](
std
::
size_t
len
,
std
::
size_t
stride
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
len
}),
std
::
vector
<
std
::
size_t
>
({
stride
}));
};
auto
f_host_tensor_descriptor2d
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
stride
,
1
}));
};
Tensor
<
ADataType
>
a
(
f_host_tensor_descriptor2d
(
M
,
N
,
Stride
));
Tensor
<
BDataType
>
b
(
f_host_tensor_descriptor2d
(
M
,
N
,
Stride
));
Tensor
<
GammaDataType
>
gamma
(
f_host_tensor_descriptor1d
(
N
,
1
));
Tensor
<
BetaDataType
>
beta
(
f_host_tensor_descriptor1d
(
N
,
1
));
Tensor
<
YDataType
>
y
(
f_host_tensor_descriptor2d
(
M
,
N
,
Stride
));
a
.
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
5
,
5
});
b
.
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
5
,
5
});
gamma
.
GenerateTensorValue
(
GeneratorTensor_2
<
GammaDataType
>
{
-
5
,
5
});
beta
.
GenerateTensorValue
(
GeneratorTensor_2
<
BetaDataType
>
{
-
5
,
5
});
DeviceMem
a_dev
(
sizeof
(
ADataType
)
*
a
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
b_dev
(
sizeof
(
BDataType
)
*
b
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
gamma_dev
(
sizeof
(
GammaDataType
)
*
gamma
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
beta_dev
(
sizeof
(
BetaDataType
)
*
beta
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
y_dev
(
sizeof
(
YDataType
)
*
y
.
mDesc
.
GetElementSpaceSize
());
a_dev
.
ToDevice
(
a
.
mData
.
data
());
b_dev
.
ToDevice
(
b
.
mData
.
data
());
gamma_dev
.
ToDevice
(
gamma
.
mData
.
data
());
beta_dev
.
ToDevice
(
beta
.
mData
.
data
());
std
::
array
<
const
void
*
,
2
>
input
=
{
a_dev
.
GetDeviceBuffer
(),
b_dev
.
GetDeviceBuffer
()};
auto
device_instance
=
DeviceInstance
{};
auto
argument_ptr
=
device_instance
.
MakeArgumentPointer
(
{
M
,
N
},
{
std
::
vector
<
ck
::
index_t
>
{
a
.
mDesc
.
GetStrides
().
begin
(),
a
.
mDesc
.
GetStrides
().
end
()},
std
::
vector
<
ck
::
index_t
>
{
b
.
mDesc
.
GetStrides
().
begin
(),
b
.
mDesc
.
GetStrides
().
end
()},
},
{
0
,
1
},
{
0
,
1
},
std
::
vector
<
ck
::
index_t
>
{
y
.
mDesc
.
GetStrides
().
begin
(),
y
.
mDesc
.
GetStrides
().
end
()},
{
1
},
1e-4
,
input
,
gamma_dev
.
GetDeviceBuffer
(),
beta_dev
.
GetDeviceBuffer
(),
y_dev
.
GetDeviceBuffer
(),
XElementwiseOperation
{},
YElementwiseOperation
{});
if
(
!
device_instance
.
IsSupportedArgument
(
argument_ptr
.
get
()))
{
std
::
cout
<<
"The runtime parameters are not supported"
<<
std
::
endl
;
return
1
;
};
auto
invoker_ptr
=
device_instance
.
MakeInvokerPointer
();
float
ela_time
=
0
;
ela_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
float
data_mem_size
=
M
*
N
*
sizeof
(
ADataType
)
+
M
*
N
*
sizeof
(
BDataType
)
+
M
*
N
*
sizeof
(
YDataType
)
+
N
*
sizeof
(
GammaDataType
)
+
N
*
sizeof
(
BetaDataType
);
float
bandwidth
=
data_mem_size
*
1000
/
ela_time
/
1024
/
1024
/
1024
;
std
::
cout
<<
"Bandwidth is : "
<<
bandwidth
<<
"GB/s . "
<<
std
::
endl
;
std
::
cout
<<
"Time elapase is : "
<<
ela_time
<<
" ms . "
<<
std
::
endl
;
bool
pass
=
true
;
{
std
::
vector
<
std
::
size_t
>
mn
=
{
static_cast
<
unsigned
long
>
(
M
),
static_cast
<
unsigned
long
>
(
N
)};
Tensor
<
XDataType
>
x
(
f_host_tensor_descriptor2d
(
M
,
N
,
Stride
));
host_elementwise2D
<
Tensor
<
ADataType
>
,
Tensor
<
BDataType
>
,
Tensor
<
XDataType
>
,
XElementwiseOperation
>
(
x
,
a
,
b
,
mn
,
XElementwiseOperation
{});
Tensor
<
YDataType
>
host_y
(
f_host_tensor_descriptor2d
(
M
,
N
,
Stride
));
using
ReferenceInstance
=
ck
::
tensor_operation
::
host
::
ReferenceLayernorm
<
XDataType
,
GammaDataType
,
BetaDataType
,
YDataType
,
AccDataType
,
YElementwiseOperation
,
Rank
,
NumReduceDim
>
;
ReferenceInstance
ref
;
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
YElementwiseOperation
{},
{
M
,
N
},
{
1
},
1e-4
);
auto
ref_invoker
=
ref
.
MakeInvoker
();
ref_invoker
.
Run
(
ref_argument
);
y_dev
.
FromDevice
(
y
.
mData
.
data
());
pass
&=
ck
::
utils
::
check_err
(
y
.
mData
,
host_y
.
mData
,
"Error: Incorrect results d1"
,
1e-3
,
1e-3
);
if
(
!
(
pass
))
{
std
::
cout
<<
"layernorm wrong"
<<
std
::
endl
;
}
}
return
(
pass
?
0
:
1
);
}
example/CMakeLists.txt
View file @
24af0144
...
@@ -12,6 +12,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
...
@@ -12,6 +12,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
add_test
(
NAME
${
EXAMPLE_NAME
}
COMMAND $<TARGET_FILE:
${
EXAMPLE_NAME
}
>
${
ARGN
}
)
add_test
(
NAME
${
EXAMPLE_NAME
}
COMMAND $<TARGET_FILE:
${
EXAMPLE_NAME
}
>
${
ARGN
}
)
add_dependencies
(
examples
${
EXAMPLE_NAME
}
)
add_dependencies
(
examples
${
EXAMPLE_NAME
}
)
add_dependencies
(
check
${
EXAMPLE_NAME
}
)
add_dependencies
(
check
${
EXAMPLE_NAME
}
)
rocm_install
(
TARGETS
${
EXAMPLE_NAME
}
COMPONENT examples
)
endfunction
(
add_example_executable EXAMPLE_NAME
)
endfunction
(
add_example_executable EXAMPLE_NAME
)
function
(
add_example_executable_no_testing EXAMPLE_NAME FILE_NAME
)
function
(
add_example_executable_no_testing EXAMPLE_NAME FILE_NAME
)
...
@@ -19,6 +20,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
...
@@ -19,6 +20,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
add_executable
(
${
EXAMPLE_NAME
}
${
FILE_NAME
}
)
add_executable
(
${
EXAMPLE_NAME
}
${
FILE_NAME
}
)
target_link_libraries
(
${
EXAMPLE_NAME
}
PRIVATE utility
)
target_link_libraries
(
${
EXAMPLE_NAME
}
PRIVATE utility
)
add_dependencies
(
examples
${
EXAMPLE_NAME
}
)
add_dependencies
(
examples
${
EXAMPLE_NAME
}
)
rocm_install
(
TARGETS
${
EXAMPLE_NAME
}
COMPONENT examples
)
endfunction
(
add_example_executable_no_testing EXAMPLE_NAME
)
endfunction
(
add_example_executable_no_testing EXAMPLE_NAME
)
# add all example subdir
# add all example subdir
...
...
include/ck/ck.hpp
View file @
24af0144
...
@@ -126,8 +126,14 @@
...
@@ -126,8 +126,14 @@
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
// experimental feature: optimize for inter-wave scheduling policy
// experimental feature: optimize for inter-wave scheduling policy
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
0
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
1
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
// this will let make_default_loop_scheduler() return interwave scheduling flag by default
#define CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING 0
// experimental feature: add instances using interwave scheduling
#define CK_EXPERIMENTAL_INTER_WAVE_INSTANCES 1
// experimental feature: add instances using pipeline v2
#define CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES 1
// hack: have underlying assumption that need to be satsified, otherwise it's a bug
// hack: have underlying assumption that need to be satsified, otherwise it's a bug
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
...
@@ -144,17 +150,6 @@
...
@@ -144,17 +150,6 @@
// workaround: compiler gnerating inefficient ds_write instructions
// workaround: compiler gnerating inefficient ds_write instructions
#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
// (gfx908 only) workaround: compiler crash in fused kernels on mainline #9110; #10738 seems ok
// error message was "fatal error: error in backend: Error while trying to spill VGPR0 from class
// VGPR_32: Cannot scavenge register without an emergency spill slot!"
// this fall back to less ideal way of handle NPadding in fused attention kernel
#ifdef __gfx908__
#define CK_WORKAROUND_SWDEV_XXXXXX_ATTN_KERNEL_CLANG_CANNOT_SCAVENGE_REGISTER 1
#else
// for __gfx90a__, ...
#define CK_WORKAROUND_SWDEV_XXXXXX_ATTN_KERNEL_CLANG_CANNOT_SCAVENGE_REGISTER 0
#endif // __gfx908__
// workaround: verifaction failure, due to compiler regression, for conv bwd-data fp16 using some
// workaround: verifaction failure, due to compiler regression, for conv bwd-data fp16 using some
// tuning parameter
// tuning parameter
#define CK_WORKAROUND_SWDEV_325164 0
#define CK_WORKAROUND_SWDEV_325164 0
...
...
include/ck/tensor_description/tensor_space_filling_curve.hpp
View file @
24af0144
...
@@ -14,7 +14,8 @@ namespace ck {
...
@@ -14,7 +14,8 @@ namespace ck {
template
<
typename
TensorLengths
,
template
<
typename
TensorLengths
,
typename
DimAccessOrder
,
typename
DimAccessOrder
,
typename
ScalarsPerAccess
>
// # of scalars per access in each dimension
typename
ScalarsPerAccess
,
bool
SnakeCurved
=
true
>
// # of scalars per access in each dimension
struct
SpaceFillingCurve
struct
SpaceFillingCurve
{
{
static
constexpr
index_t
nDim
=
TensorLengths
::
Size
();
static
constexpr
index_t
nDim
=
TensorLengths
::
Size
();
...
@@ -136,9 +137,10 @@ struct SpaceFillingCurve
...
@@ -136,9 +137,10 @@ struct SpaceFillingCurve
Index
ordered_idx
;
Index
ordered_idx
;
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
idim
)
{
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
idim
)
{
ordered_idx
(
idim
)
=
forward_sweep
[
idim
]
?
ordered_access_idx
[
idim
]
ordered_idx
(
idim
)
=
:
ordered_access_lengths
[
idim
]
-
1
-
!
SnakeCurved
||
forward_sweep
[
idim
]
ordered_access_idx
[
idim
];
?
ordered_access_idx
[
idim
]
:
ordered_access_lengths
[
idim
]
-
1
-
ordered_access_idx
[
idim
];
});
});
return
container_reorder_given_old2new
(
ordered_idx
,
dim_access_order
)
*
return
container_reorder_given_old2new
(
ordered_idx
,
dim_access_order
)
*
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
View file @
24af0144
...
@@ -18,11 +18,11 @@ enum struct LoopScheduler
...
@@ -18,11 +18,11 @@ enum struct LoopScheduler
constexpr
LoopScheduler
make_default_loop_scheduler
()
constexpr
LoopScheduler
make_default_loop_scheduler
()
{
{
#if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
#if CK_EXPERIMENTAL_
DEFAULT_TO_
INTER_WAVE_SCHEDULING
return
LoopScheduler
::
Interwave
;
return
LoopScheduler
::
Interwave
;
#else
#else
return
LoopScheduler
::
Default
;
return
LoopScheduler
::
Default
;
#endif // if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
#endif // if CK_EXPERIMENTAL_
DEFAULT_TO_
INTER_WAVE_SCHEDULING
}
}
template
<
index_t
MNXdlPerWave
,
index_t
MNWaves
,
index_t
MNPerXdl
,
typename
TileDesc_K0_MN_K1
>
template
<
index_t
MNXdlPerWave
,
index_t
MNWaves
,
index_t
MNPerXdl
,
typename
TileDesc_K0_MN_K1
>
...
@@ -151,6 +151,27 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -151,6 +151,27 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
return
make_tuple
(
c_thread_m
,
c_thread_n
);
return
make_tuple
(
c_thread_m
,
c_thread_n
);
}
}
template
<
index_t
m0
,
index_t
n0
,
index_t
xdlops_i
,
index_t
blk_i
>
__device__
static
auto
CalculateCThreadOriginDataIndex8D
(
Number
<
m0
>
,
Number
<
n0
>
,
Number
<
xdlops_i
>
,
Number
<
blk_i
>
)
{
const
auto
wave_idx
=
GetWaveIdx
();
const
auto
waveId_m
=
wave_idx
[
I0
];
const
auto
waveId_n
=
wave_idx
[
I1
];
const
auto
blk_idx
=
xdlops_gemm
.
GetBeginOfThreadBlk4D
(
xdlops_i
,
blk_i
);
return
make_tuple
(
Number
<
m0
>
{},
Number
<
n0
>
{},
waveId_m
,
waveId_n
,
blk_idx
[
I0
],
blk_idx
[
I1
],
blk_idx
[
I2
],
blk_idx
[
I3
]);
}
__host__
__device__
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
()
__host__
__device__
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
()
{
{
static_assert
(
AK0MK1BlockDesc
::
IsKnownAtCompileTime
()
&&
static_assert
(
AK0MK1BlockDesc
::
IsKnownAtCompileTime
()
&&
...
@@ -724,6 +745,21 @@ struct BlockwiseGemmXdlops_v2
...
@@ -724,6 +745,21 @@ struct BlockwiseGemmXdlops_v2
return
make_tuple
(
c_thread_m
,
c_thread_n
);
return
make_tuple
(
c_thread_m
,
c_thread_n
);
}
}
template
<
index_t
m0
,
index_t
n0
,
index_t
xdlops_i
,
index_t
blk_i
>
__device__
static
auto
CalculateCThreadOriginDataIndex8D
(
Number
<
m0
>
,
Number
<
n0
>
,
Number
<
xdlops_i
>
,
Number
<
blk_i
>
)
{
const
auto
wave_idx
=
GetWaveIdx
();
const
auto
waveId_m
=
wave_idx
[
I0
];
const
auto
waveId_n
=
wave_idx
[
I1
];
const
auto
blk_idx
=
xdlops_gemm
.
GetBeginOfThreadBlk4D
(
xdlops_i
,
blk_i
);
return
make_tuple
(
m0
,
n0
,
waveId_m
,
waveId_n
,
blk_idx
[
I0
],
blk_idx
[
I1
],
blk_idx
[
I2
],
blk_idx
[
I3
]);
}
using
Tuple4
=
decltype
(
CalculateAThreadOriginDataIndex
());
using
Tuple4
=
decltype
(
CalculateAThreadOriginDataIndex
());
__host__
__device__
BlockwiseGemmXdlops_v2
(
Tuple4
a_origin
=
CalculateAThreadOriginDataIndex
(),
__host__
__device__
BlockwiseGemmXdlops_v2
(
Tuple4
a_origin
=
CalculateAThreadOriginDataIndex
(),
...
...
include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
View file @
24af0144
...
@@ -24,7 +24,8 @@ template <typename ALayout,
...
@@ -24,7 +24,8 @@ template <typename ALayout,
typename
B0ElementwiseOperation
,
typename
B0ElementwiseOperation
,
typename
Acc0ElementwiseOperation
,
typename
Acc0ElementwiseOperation
,
typename
B1ElementwiseOperation
,
typename
B1ElementwiseOperation
,
typename
CElementwiseOperation
>
typename
CElementwiseOperation
,
bool
MaskOutUpperTriangle
>
// TODO: enum for mask type
struct
DeviceBatchedGemmSoftmaxGemm
:
public
BaseOperator
struct
DeviceBatchedGemmSoftmaxGemm
:
public
BaseOperator
{
{
virtual
std
::
unique_ptr
<
BaseArgument
>
virtual
std
::
unique_ptr
<
BaseArgument
>
...
...
include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
View file @
24af0144
...
@@ -7,49 +7,60 @@
...
@@ -7,49 +7,60 @@
#include <vector>
#include <vector>
#include "device_base.hpp"
#include "device_base.hpp"
#include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
template
<
typename
ALayout
,
template
<
index_t
NumDimG
,
typename
B0Layout
,
index_t
NumDimM
,
typename
B1Layout
,
index_t
NumDimN
,
typename
CPermuteNumDims_G_M_Gemm1N
,
// Sequence<>
index_t
NumDimK
,
index_t
NumDimO
,
typename
ADataType
,
typename
ADataType
,
typename
B0DataType
,
typename
B0DataType
,
typename
B1DataType
,
typename
B1DataType
,
typename
CDataType
,
typename
CDataType
,
typename
Acc0BiasDataType
,
typename
Acc1BiasDataType
,
typename
AElementwiseOperation
,
typename
AElementwiseOperation
,
typename
B0ElementwiseOperation
,
typename
B0ElementwiseOperation
,
typename
Acc0ElementwiseOperation
,
typename
Acc0ElementwiseOperation
,
typename
B1ElementwiseOperation
,
typename
B1ElementwiseOperation
,
typename
CElementwiseOperation
>
typename
CElementwiseOperation
,
MaskingSpecialization
MaskingSpec
>
struct
DeviceBatchedGemmSoftmaxGemmPermute
:
public
BaseOperator
struct
DeviceBatchedGemmSoftmaxGemmPermute
:
public
BaseOperator
{
{
virtual
std
::
unique_ptr
<
BaseArgument
>
static
constexpr
index_t
NumAcc0Bias
=
Acc0BiasDataType
::
Size
();
MakeArgumentPointer
(
const
void
*
p_a
,
static
constexpr
index_t
NumAcc1Bias
=
Acc1BiasDataType
::
Size
();
const
void
*
p_b0
,
const
void
*
p_b1
,
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
void
*
p_c
,
const
void
*
p_a
,
ck
::
index_t
M
,
const
void
*
p_b0
,
ck
::
index_t
N
,
const
void
*
p_b1
,
ck
::
index_t
K
,
void
*
p_c
,
ck
::
index_t
O
,
const
std
::
array
<
void
*
,
NumAcc0Bias
>
p_acc0_biases
,
ck
::
index_t
Batch
,
const
std
::
array
<
void
*
,
NumAcc1Bias
>
p_acc1_biases
,
std
::
vector
<
index_t
>
c_gs_ms_os_lengths
,
const
std
::
vector
<
index_t
>&
a_gs_ms_ks_lengths
,
std
::
vector
<
index_t
>
c_gs_ms_os_strides
,
const
std
::
vector
<
index_t
>&
a_gs_ms_ks_strides
,
ck
::
index_t
StrideA
,
const
std
::
vector
<
index_t
>&
b_gs_ns_ks_lengths
,
ck
::
index_t
StrideB0
,
const
std
::
vector
<
index_t
>&
b_gs_ns_ks_strides
,
ck
::
index_t
StrideB1
,
const
std
::
vector
<
index_t
>&
b1_gs_gemm1ns_gemm1ks_lengths
,
// b1_gs_os_ns_lengths
ck
::
index_t
BatchStrideA
,
const
std
::
vector
<
index_t
>&
b1_gs_gemm1ns_gemm1ks_strides
,
// b1_gs_os_ns_strides
ck
::
index_t
BatchStrideB0
,
const
std
::
vector
<
index_t
>&
c_gs_ms_gemm1ns_lengths
,
// c_gs_ms_os_lengths
ck
::
index_t
BatchStrideB1
,
const
std
::
vector
<
index_t
>&
c_gs_ms_gemm1ns_strides
,
// c_gs_ms_os_strides
AElementwiseOperation
a_element_op
,
const
std
::
array
<
std
::
vector
<
index_t
>
,
NumAcc0Bias
>
acc0_biases_gs_ms_ns_lengths
,
B0ElementwiseOperation
b0_element_op
,
const
std
::
array
<
std
::
vector
<
index_t
>
,
NumAcc0Bias
>
acc0_biases_gs_ms_ns_strides
,
Acc0ElementwiseOperation
acc0_element_op
,
const
std
::
array
<
std
::
vector
<
index_t
>
,
NumAcc1Bias
>
B1ElementwiseOperation
b1_element_op
,
acc1_biases_gs_ms_gemm1ns_lengths
,
// acc1_biases_gs_ms_os_lengths
CElementwiseOperation
c_element_op
)
=
0
;
const
std
::
array
<
std
::
vector
<
index_t
>
,
NumAcc1Bias
>
acc1_biases_gs_ms_gemm1ns_strides
,
// acc1_biases_gs_ms_os_strides
AElementwiseOperation
a_element_op
,
B0ElementwiseOperation
b0_element_op
,
Acc0ElementwiseOperation
acc0_element_op
,
B1ElementwiseOperation
b1_element_op
,
CElementwiseOperation
c_element_op
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
};
...
...
include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp
View file @
24af0144
...
@@ -13,31 +13,36 @@ namespace ck {
...
@@ -13,31 +13,36 @@ namespace ck {
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
template
<
index_t
Rank
,
index_t
NumBatchNormReduceDim
>
template
<
index_t
Rank
,
index_t
NumBatchNormReduceDim
,
typename
YElementwiseOp
>
struct
DeviceBatchNormFwd
:
public
BaseOperator
struct
DeviceBatchNormFwd
:
public
BaseOperator
{
{
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
std
::
array
<
index_t
,
Rank
>
xyLengths
,
const
std
::
array
<
index_t
,
Rank
>
xyLengths
,
const
std
::
array
<
index_t
,
Rank
>
xStrides
,
const
std
::
array
<
index_t
,
Rank
>
xStrides
,
const
std
::
array
<
index_t
,
Rank
>
yStrides
,
const
std
::
array
<
index_t
,
Rank
>
yStrides
,
const
std
::
array
<
int
,
NumBatchNormReduceDim
>
reduceDims
,
const
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnScaleBiasMeanVarLengths
,
const
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnScaleBiasMeanVarLengths
,
const
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnScaleBiasMeanVarStrides
,
const
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnScaleStrides
,
const
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnBiasStrides
,
const
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnMeanVarStrides
,
const
void
*
p_x
,
const
void
*
p_x
,
const
void
*
bnScale
,
const
void
*
bnScale
,
const
void
*
bnBias
,
const
void
*
bnBias
,
double
epsilon
,
const
YElementwiseOp
y_elementwise_op
,
void
*
p_y
,
void
*
p_y
,
void
*
resultSaveMean
,
void
*
resultSaveInvVariance
,
double
exponentialAverageFactor
,
double
exponentialAverageFactor
,
void
*
resultRunningMean
,
void
*
resultRunningMean
,
void
*
resultRunningVariance
,
void
*
resultRunningVariance
)
=
0
;
double
epsilon
,
void
*
resultSaveMean
,
void
*
resultSaveInvVariance
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
};
template
<
index_t
Rank
,
index_t
NumBatchNormReduceDim
>
template
<
index_t
Rank
,
index_t
NumBatchNormReduceDim
,
typename
YElementwiseOp
>
using
DeviceBatchNormFwdPtr
=
std
::
unique_ptr
<
DeviceBatchNormFwd
<
Rank
,
NumBatchNormReduceDim
>>
;
using
DeviceBatchNormFwdPtr
=
std
::
unique_ptr
<
DeviceBatchNormFwd
<
Rank
,
NumBatchNormReduceDim
,
YElementwiseOp
>>
;
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
...
...
include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp
View file @
24af0144
...
@@ -21,7 +21,9 @@ struct DeviceBatchNormInfer : public BaseOperator
...
@@ -21,7 +21,9 @@ struct DeviceBatchNormInfer : public BaseOperator
const
std
::
array
<
index_t
,
Rank
>
xStrides
,
const
std
::
array
<
index_t
,
Rank
>
xStrides
,
const
std
::
array
<
index_t
,
Rank
>
yStrides
,
const
std
::
array
<
index_t
,
Rank
>
yStrides
,
const
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnScaleBiasMeanVarLengths
,
const
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnScaleBiasMeanVarLengths
,
const
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnScaleBiasMeanVarStrides
,
const
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnScaleStrides
,
const
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnBiasStrides
,
const
std
::
array
<
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnMeanVarStrides
,
const
void
*
p_x
,
const
void
*
p_x
,
const
void
*
bnScale
,
const
void
*
bnScale
,
const
void
*
bnBias
,
const
void
*
bnBias
,
...
...
include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
template
<
typename
InDataTypeTuple
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
AccDataType
,
typename
YDataType
,
typename
XElementwiseOperation
,
typename
YElementwiseOperation
,
index_t
Rank
,
index_t
NumReduceDim
>
struct
DeviceElementwiseNormalization
:
public
BaseOperator
{
static
constexpr
int
NumInput
=
InDataTypeTuple
::
Size
();
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
std
::
vector
<
index_t
>
lengths
,
const
std
::
array
<
std
::
vector
<
index_t
>
,
NumInput
>
inStridesArray
,
const
std
::
vector
<
index_t
>
gammaStrides
,
const
std
::
vector
<
index_t
>
betaStrides
,
const
std
::
vector
<
index_t
>
yStrides
,
const
std
::
vector
<
index_t
>
reduceDims
,
AccDataType
epsilon
,
const
std
::
array
<
const
void
*
,
NumInput
>
in_dev_buffers
,
const
void
*
p_gamma
,
const
void
*
p_beta
,
void
*
p_y
,
XElementwiseOperation
x_elementwise_op
,
YElementwiseOperation
y_elementwise_op
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
template
<
typename
InDataTypeTuple
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
AccDataType
,
typename
YDataType
,
typename
XElementwiseOperation
,
typename
YElementwiseOperation
,
index_t
Rank
,
index_t
NumReduceDim
>
using
DeviceElementwiseNormalizationPtr
=
std
::
unique_ptr
<
DeviceElementwiseNormalization
<
InDataTypeTuple
,
GammaDataType
,
BetaDataType
,
AccDataType
,
YDataType
,
XElementwiseOperation
,
YElementwiseOperation
,
Rank
,
NumReduceDim
>>
;
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp
View file @
24af0144
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#pragma once
#pragma once
#include <
vector
>
#include <
array
>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
...
...
include/ck/tensor_operation/gpu/device/device_conv_bwd_weight.hpp
→
include/ck/tensor_operation/gpu/device/device_
grouped_
conv_bwd_weight.hpp
View file @
24af0144
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#pragma once
#pragma once
#include <
vector
>
#include <
array
>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
...
@@ -11,7 +11,7 @@ namespace ck {
...
@@ -11,7 +11,7 @@ namespace ck {
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
template
<
ck
::
index_t
N
um
DimSpatial
,
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
InLayout
,
typename
WeiLayout
,
typename
WeiLayout
,
typename
OutLayout
,
typename
OutLayout
,
...
@@ -21,22 +21,23 @@ template <ck::index_t NumDimSpatial,
...
@@ -21,22 +21,23 @@ template <ck::index_t NumDimSpatial,
typename
InElementwiseOperation
,
typename
InElementwiseOperation
,
typename
WeiElementwiseOperation
,
typename
WeiElementwiseOperation
,
typename
OutElementwiseOperation
>
typename
OutElementwiseOperation
>
struct
DeviceConvBwdWeight
:
public
BaseOperator
struct
Device
Grouped
ConvBwdWeight
:
public
BaseOperator
{
{
virtual
std
::
unique_ptr
<
BaseArgument
>
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_in
,
MakeArgumentPointer
(
const
void
*
p_in
,
void
*
p_wei
,
void
*
p_wei
,
const
void
*
p_out
,
const
void
*
p_out
,
ck
::
index_t
G
,
ck
::
index_t
N
,
ck
::
index_t
N
,
ck
::
index_t
K
,
ck
::
index_t
K
,
ck
::
index_t
C
,
ck
::
index_t
C
,
std
::
vector
<
ck
::
index_t
>
input_spatial_lengths
,
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
filter_spatial_lengths
,
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
filter_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
output_spatial_lengths
,
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
output_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
conv_filter_strides
,
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
,
std
::
vector
<
ck
::
index_t
>
conv_filter_dilations
,
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
,
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
,
InElementwiseOperation
in_element_op
,
InElementwiseOperation
in_element_op
,
WeiElementwiseOperation
wei_element_op
,
WeiElementwiseOperation
wei_element_op
,
OutElementwiseOperation
out_element_op
,
OutElementwiseOperation
out_element_op
,
...
...
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <array>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
// Convolution Forward:
// input : input image A[G, N, C, Hi, Wi],
// input : weight B[G, K, C, Y, X],
// output : output image E[G, N, K, Ho, Wo]
// C = a_op(A) * b_op(B)
// E = cde_op(C, D0, D1, ...)
template
<
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
,
typename
InDataType
,
typename
WeiDataType
,
typename
OutDataType
,
typename
InElementwiseOperation
,
typename
WeiElementwiseOperation
,
typename
OutElementwiseOperation
>
struct
DeviceGroupedConvFwd
:
public
BaseOperator
{
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_in
,
// input image
const
void
*
p_wei
,
// weight
void
*
p_out
,
// output image
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
in_g_n_c_wis_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
in_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
wei_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
wei_g_k_c_xs_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
out_g_n_k_wos_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
out_g_n_k_wos_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_right_pads
,
const
InElementwiseOperation
&
in_element_op
,
const
WeiElementwiseOperation
&
wei_element_op
,
const
OutElementwiseOperation
&
out_element_op
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <functional>
#include <iostream>
#include <iterator>
#include <numeric>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
{
struct
ComputePtrOffsetOfStridedBatch
{
ComputePtrOffsetOfStridedBatch
(
index_t
BatchStrideA
,
index_t
BatchStrideB
,
index_t
BatchStrideC
)
:
BatchStrideA_
(
BatchStrideA
),
BatchStrideB_
(
BatchStrideB
),
BatchStrideC_
(
BatchStrideC
)
{
}
__host__
__device__
constexpr
long_index_t
GetAPtrOffset
(
index_t
g_idx
)
const
{
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideA_
);
}
__host__
__device__
constexpr
long_index_t
GetBPtrOffset
(
index_t
g_idx
)
const
{
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideB_
);
}
__host__
__device__
constexpr
long_index_t
GetCPtrOffset
(
index_t
g_idx
)
const
{
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideC_
);
}
index_t
BatchStrideA_
;
index_t
BatchStrideB_
;
index_t
BatchStrideC_
;
};
/*
* \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM.
*
* \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix
* given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly
* strided batched, but we can easily extend to other layouts. The returned offset can be either \p
* index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB
* limitations.
*
* \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and
* returns the 2D index of the tile that it computes. \see
* GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run().
*
* \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2
* tiles from different matrices. Keep in mind that these 2 matrices can share the same grid
* descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link
* device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link
* DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of
* pointer offset into \p ComputePtrOffsetOfStridedBatch.
*
* \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes.
* Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to
* realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion).
*
*/
template
<
typename
GridwiseGemm
,
typename
ABDataType
,
typename
CDataType
,
typename
AGridDesc_K0_M0_M1_K1
,
typename
BGridDesc_K0_N0_N1_K1
,
typename
CGridDesc_M0_M10_M11_N0_N10_N11
,
typename
Block2CTileMap
,
typename
ComputePtrOffsetOfBatch
,
bool
HasMainKBlockLoop
,
bool
HasDoubleTailKBlockLoop
>
__global__
void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
kernel_grouped_conv_fwd_dl
(
const
ABDataType
*
__restrict__
p_a_grid
,
const
ABDataType
*
__restrict__
p_b_grid
,
CDataType
*
__restrict__
p_c_grid
,
const
index_t
batch_count
,
const
AGridDesc_K0_M0_M1_K1
a_grid_desc_k0_m0_m1_k1
,
const
BGridDesc_K0_N0_N1_K1
b_grid_desc_k0_n0_n1_k1
,
const
CGridDesc_M0_M10_M11_N0_N10_N11
c_grid_desc_m0_m10_m11_n0_n10_n11
,
const
Block2CTileMap
block_2_ctile_map
,
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__))
// offset base pointer for each work-group
const
index_t
num_blocks_per_batch
=
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
const
index_t
g_idx
=
__builtin_amdgcn_readfirstlane
(
get_block_1d_id
()
/
num_blocks_per_batch
);
const
long_index_t
a_batch_offset
=
__builtin_amdgcn_readfirstlane
(
static_cast
<
long_index_t
>
(
compute_ptr_offset_of_batch
.
GetAPtrOffset
(
g_idx
)));
const
long_index_t
b_batch_offset
=
__builtin_amdgcn_readfirstlane
(
static_cast
<
long_index_t
>
(
compute_ptr_offset_of_batch
.
GetBPtrOffset
(
g_idx
)));
const
long_index_t
c_batch_offset
=
__builtin_amdgcn_readfirstlane
(
static_cast
<
long_index_t
>
(
compute_ptr_offset_of_batch
.
GetCPtrOffset
(
g_idx
)));
constexpr
index_t
shared_block_size
=
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()
/
sizeof
(
ABDataType
);
__shared__
ABDataType
p_shared
[
shared_block_size
];
GridwiseGemm
::
Run
(
p_a_grid
+
a_batch_offset
,
p_b_grid
+
b_batch_offset
,
p_c_grid
+
c_batch_offset
,
p_shared
,
a_grid_desc_k0_m0_m1_k1
,
b_grid_desc_k0_n0_n1_k1
,
c_grid_desc_m0_m10_m11_n0_n10_n11
,
block_2_ctile_map
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
{},
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
{});
#else
ignore
=
p_a_grid
;
ignore
=
p_b_grid
;
ignore
=
p_c_grid
;
ignore
=
batch_count
;
ignore
=
a_grid_desc_k0_m0_m1_k1
;
ignore
=
b_grid_desc_k0_n0_n1_k1
;
ignore
=
c_grid_desc_m0_m10_m11_n0_n10_n11
;
ignore
=
compute_ptr_offset_of_batch
;
ignore
=
block_2_ctile_map
;
compute_ptr_offset_of_batch
.
GetAPtrOffset
(
0
);
compute_ptr_offset_of_batch
.
GetBPtrOffset
(
0
);
compute_ptr_offset_of_batch
.
GetCPtrOffset
(
0
);
#endif
}
}
// namespace
//
// @brief Device Convolution operation.
//
// Supports:
// @li Forward convolution with up to 3 spatial dimentions
// @li Input tensor in GNWC data format
// @li Weight tensor in GKXC data format
// @li Output tensor in GNWK data format
//
// 1D:
// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C]
// 2D:
// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
// 3D:
// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
//
template
<
index_t
NDimSpatial
,
typename
ADataType
,
typename
BDataType
,
typename
CDataType
,
typename
AccDataType
,
typename
ALayout
,
typename
BLayout
,
typename
CLayout
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CElementwiseOperation
,
ConvolutionForwardSpecialization
ConvForwardSpecialization
,
GemmSpecialization
GemmSpec
,
index_t
BlockSize
,
index_t
MPerBlock
,
index_t
NPerBlock
,
index_t
K0PerBlock
,
index_t
K1
,
index_t
M1PerThread
,
index_t
N1PerThread
,
index_t
KPerThread
,
typename
M1N1ThreadClusterM1Xs
,
typename
M1N1ThreadClusterN1Xs
,
typename
ABlockTransferThreadSliceLengths_K0_M0_M1_K1
,
typename
ABlockTransferThreadClusterLengths_K0_M0_M1_K1
,
typename
ABlockTransferThreadClusterArrangeOrder
,
typename
ABlockTransferSrcAccessOrder
,
typename
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
,
typename
ABlockTransferSrcVectorTensorContiguousDimOrder
,
typename
ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
,
typename
BBlockTransferThreadSliceLengths_K0_N0_N1_K1
,
typename
BBlockTransferThreadClusterLengths_K0_N0_N1_K1
,
typename
BBlockTransferThreadClusterArrangeOrder
,
typename
BBlockTransferSrcAccessOrder
,
typename
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
,
typename
BBlockTransferSrcVectorTensorContiguousDimOrder
,
typename
BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
,
typename
CThreadTransferSrcDstAccessOrder
,
index_t
CThreadTransferSrcDstVectorDim
,
index_t
CThreadTransferDstScalarPerVector
,
enable_if_t
<
is_same_v
<
AElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
&&
is_same_v
<
BElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
&&
is_same_v
<
CElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
,
bool
>
=
false
>
struct
DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
:
public
DeviceGroupedConvFwd
<
NDimSpatial
,
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
AElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
>
{
using
DeviceOp
=
DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
;
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
conv_to_gemm_transformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvForwardSpecialization
>
{};
static
constexpr
auto
matrix_padder
=
MatrixPadder
<
GemmSpec
,
index_t
,
index_t
,
index_t
>
{
MPerBlock
,
NPerBlock
,
K0PerBlock
};
template
<
typename
ALay
>
static
auto
MakeAGridDescriptor_AK0_M_AK1
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
c_g_n_k_wos_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
c_g_n_k_wos_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_right_pads
)
{
const
auto
in_gemmmraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeADescriptor_M_K
<
ALay
>(
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
c_g_n_k_wos_lengths
,
c_g_n_k_wos_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
);
const
auto
in_gemmm_gemmk_desc
=
matrix_padder
.
PadADescriptor_M_K
(
in_gemmmraw_gemmkraw_desc
);
const
auto
M
=
in_gemmm_gemmk_desc
.
GetLength
(
I0
);
const
auto
K
=
in_gemmm_gemmk_desc
.
GetLength
(
I1
);
const
auto
AK0
=
K
/
K1
;
return
transform_tensor_descriptor
(
in_gemmm_gemmk_desc
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
AK0
,
K1
)),
make_pass_through_transform
(
M
)),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
>
{}));
}
template
<
typename
BLay
>
static
auto
MakeBGridDescriptor_BK0_N_BK1
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
)
{
const
auto
wei_gemmnraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeBDescriptor_N_K
<
BLay
>(
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
);
const
auto
wei_gemmn_gemmk_desc
=
matrix_padder
.
PadBDescriptor_N_K
(
wei_gemmnraw_gemmkraw_desc
);
const
auto
N
=
wei_gemmn_gemmk_desc
.
GetLength
(
I0
);
const
auto
K
=
wei_gemmn_gemmk_desc
.
GetLength
(
I1
);
const
auto
BK0
=
K
/
K1
;
return
transform_tensor_descriptor
(
wei_gemmn_gemmk_desc
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
BK0
,
K1
)),
make_pass_through_transform
(
N
)),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
>
{}));
}
template
<
typename
CLay
>
static
auto
MakeCGridDescriptor_M_N
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
c_g_n_k_wos_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
c_g_n_k_wos_strides
)
{
const
auto
out_gemmmraw_gemmnraw_desc
=
conv_to_gemm_transformer
.
template
MakeCDescriptor_M_N
<
CLay
>(
c_g_n_k_wos_lengths
,
c_g_n_k_wos_strides
);
const
auto
out_gemmm_gemmn_desc
=
matrix_padder
.
PadCDescriptor_M_N
(
out_gemmmraw_gemmnraw_desc
);
return
out_gemmm_gemmn_desc
;
}
// desc for problem definition
using
AGridDesc_AK0_M_AK1
=
remove_cvref_t
<
decltype
(
MakeAGridDescriptor_AK0_M_AK1
<
ALayout
>
({},
{},
{},
{},
{},
{},
{},
{},
{},
{}))
>
;
using
BGridDesc_BK0_N_BK1
=
remove_cvref_t
<
decltype
(
MakeBGridDescriptor_BK0_N_BK1
<
BLayout
>
({},
{}))
>
;
using
CGridDesc_M_N
=
remove_cvref_t
<
decltype
(
MakeCGridDescriptor_M_N
<
CLayout
>
({},
{}))
>
;
// GridwiseGemm
using
GridwiseGemm
=
GridwiseGemmDl_km_kn_mn_v1r3
<
BlockSize
,
ADataType
,
AccDataType
,
CDataType
,
InMemoryDataOperationEnum
::
Set
,
AGridDesc_AK0_M_AK1
,
BGridDesc_BK0_N_BK1
,
CGridDesc_M_N
,
MPerBlock
,
NPerBlock
,
K0PerBlock
,
K1
,
M1PerThread
,
N1PerThread
,
KPerThread
,
M1N1ThreadClusterM1Xs
,
M1N1ThreadClusterN1Xs
,
ABlockTransferThreadSliceLengths_K0_M0_M1_K1
,
ABlockTransferThreadClusterLengths_K0_M0_M1_K1
,
ABlockTransferThreadClusterArrangeOrder
,
ABlockTransferSrcAccessOrder
,
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
,
ABlockTransferSrcVectorTensorContiguousDimOrder
,
ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
,
BBlockTransferThreadSliceLengths_K0_N0_N1_K1
,
BBlockTransferThreadClusterLengths_K0_N0_N1_K1
,
BBlockTransferThreadClusterArrangeOrder
,
BBlockTransferSrcAccessOrder
,
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
,
BBlockTransferSrcVectorTensorContiguousDimOrder
,
BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
CThreadTransferDstScalarPerVector
>
;
using
AGridDesc_K0_M0_M1_K1
=
decltype
(
GridwiseGemm
::
MakeAGridDescriptor_K0_M0_M1_K1
(
AGridDesc_AK0_M_AK1
{}));
using
BGridDesc_K0_N0_N1_K1
=
decltype
(
GridwiseGemm
::
MakeBGridDescriptor_K0_N0_N1_K1
(
BGridDesc_BK0_N_BK1
{}));
using
CGridDesc_M0_M10_M11_N0_N10_N11
=
decltype
(
GridwiseGemm
::
MakeCGridDescriptor_M0_M10_M11_N0_N10_N11
(
CGridDesc_M_N
{}));
using
DefaultBlock2CTileMap
=
decltype
(
GridwiseGemm
::
MakeDefaultBlock2CTileMap
(
CGridDesc_M_N
{}));
// Argument
struct
Argument
:
public
BaseArgument
{
Argument
(
const
void
*
p_a
,
const
void
*
p_b
,
void
*
p_c
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
c_g_n_k_wos_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
c_g_n_k_wos_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_right_pads
,
const
AElementwiseOperation
&
a_element_op
,
const
BElementwiseOperation
&
b_element_op
,
const
CElementwiseOperation
&
c_element_op
)
:
p_a_grid_
{
static_cast
<
const
ADataType
*>
(
p_a
)},
p_b_grid_
{
static_cast
<
const
BDataType
*>
(
p_b
)},
p_c_grid_
{
static_cast
<
CDataType
*>
(
p_c
)},
num_group_
{
a_g_n_c_wis_lengths
[
0
]},
a_grid_desc_ak0_m_ak1_
{
DeviceOp
::
MakeAGridDescriptor_AK0_M_AK1
<
ALayout
>
(
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
c_g_n_k_wos_lengths
,
c_g_n_k_wos_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
)},
b_grid_desc_bk0_n_bk1_
{
DeviceOp
::
MakeBGridDescriptor_BK0_N_BK1
<
BLayout
>
(
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
)},
c_grid_desc_m_n_
{
DeviceOp
::
MakeCGridDescriptor_M_N
<
CLayout
>
(
c_g_n_k_wos_lengths
,
c_g_n_k_wos_strides
)},
a_grid_desc_k0_m0_m1_k1_
{},
b_grid_desc_k0_n0_n1_k1_
{},
c_grid_desc_m0_m10_m11_n0_n10_n11_
{},
block_2_ctile_map_
{},
compute_ptr_offset_of_batch_
{
a_g_n_c_wis_strides
[
0
],
b_g_k_c_xs_strides
[
0
],
c_g_n_k_wos_strides
[
0
]},
a_element_op_
{
a_element_op
},
b_element_op_
{
b_element_op
},
c_element_op_
{
c_element_op
},
a_g_n_c_wis_lengths_
{
a_g_n_c_wis_lengths
},
a_g_n_c_wis_strides_
{
a_g_n_c_wis_strides
},
b_g_k_c_xs_lengths_
{
b_g_k_c_xs_lengths
},
b_g_k_c_xs_strides_
{
b_g_k_c_xs_strides
},
c_g_n_k_wos_lengths_
{
c_g_n_k_wos_lengths
},
c_g_n_k_wos_strides_
{
c_g_n_k_wos_strides
},
conv_filter_strides_
{
conv_filter_strides
},
conv_filter_dilations_
{
conv_filter_dilations
},
input_left_pads_
{
input_left_pads
},
input_right_pads_
{
input_right_pads
}
{
// A/B/E Batch Stride
compute_ptr_offset_of_batch_
.
BatchStrideA_
=
a_g_n_c_wis_strides
[
0
];
compute_ptr_offset_of_batch_
.
BatchStrideB_
=
b_g_k_c_xs_strides
[
0
];
compute_ptr_offset_of_batch_
.
BatchStrideC_
=
c_g_n_k_wos_strides
[
0
];
// populate desc for Ds/E
if
(
GridwiseGemm
::
CheckValidity
(
a_grid_desc_ak0_m_ak1_
,
b_grid_desc_bk0_n_bk1_
,
c_grid_desc_m_n_
))
{
a_grid_desc_k0_m0_m1_k1_
=
GridwiseGemm
::
MakeAGridDescriptor_K0_M0_M1_K1
(
a_grid_desc_ak0_m_ak1_
);
b_grid_desc_k0_n0_n1_k1_
=
GridwiseGemm
::
MakeBGridDescriptor_K0_N0_N1_K1
(
b_grid_desc_bk0_n_bk1_
);
c_grid_desc_m0_m10_m11_n0_n10_n11_
=
GridwiseGemm
::
MakeCGridDescriptor_M0_M10_M11_N0_N10_N11
(
c_grid_desc_m_n_
);
block_2_ctile_map_
=
GridwiseGemm
::
MakeDefaultBlock2CTileMap
(
c_grid_desc_m_n_
);
}
}
void
Print
()
const
{
std
::
cout
<<
"A[K0, M, K1]: "
<<
a_grid_desc_ak0_m_ak1_
<<
std
::
endl
;
std
::
cout
<<
"B[K0, N, K1]: "
<<
b_grid_desc_bk0_n_bk1_
<<
std
::
endl
;
std
::
cout
<<
"C[M, N]: "
<<
c_grid_desc_m_n_
<<
std
::
endl
;
std
::
cout
<<
"num_group: "
<<
num_group_
<<
std
::
endl
;
std
::
cout
<<
"A[k0, m0, m1, k1]: "
<<
a_grid_desc_k0_m0_m1_k1_
<<
std
::
endl
;
std
::
cout
<<
"B[k0, n0, n1, k1]: "
<<
b_grid_desc_k0_n0_n1_k1_
<<
std
::
endl
;
std
::
cout
<<
"A[m0, m10, m11, n0, n10, n11]: "
<<
c_grid_desc_m0_m10_m11_n0_n10_n11_
<<
std
::
endl
;
}
// private:
// pointers
const
ADataType
*
p_a_grid_
;
const
BDataType
*
p_b_grid_
;
CDataType
*
p_c_grid_
;
// tensor descriptors for problem definiton
index_t
num_group_
;
AGridDesc_AK0_M_AK1
a_grid_desc_ak0_m_ak1_
;
BGridDesc_BK0_N_BK1
b_grid_desc_bk0_n_bk1_
;
CGridDesc_M_N
c_grid_desc_m_n_
;
// tensor descriptors for block/thread-wise copy
AGridDesc_K0_M0_M1_K1
a_grid_desc_k0_m0_m1_k1_
;
BGridDesc_K0_N0_N1_K1
b_grid_desc_k0_n0_n1_k1_
;
CGridDesc_M0_M10_M11_N0_N10_N11
c_grid_desc_m0_m10_m11_n0_n10_n11_
;
// block-to-e-tile map
DefaultBlock2CTileMap
block_2_ctile_map_
;
// for computing batch offset
ComputePtrOffsetOfStridedBatch
compute_ptr_offset_of_batch_
;
// element-wise op
AElementwiseOperation
a_element_op_
;
BElementwiseOperation
b_element_op_
;
CElementwiseOperation
c_element_op_
;
// for checking IsSupportedArgument()
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
c_g_n_k_wos_lengths_
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
c_g_n_k_wos_strides_
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_dilations_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_
;
};
// Invoker
struct
Invoker
:
public
BaseInvoker
{
using
Argument
=
DeviceOp
::
Argument
;
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
// if(stream_config.log_level_ > 0)
{
arg
.
Print
();
}
if
(
!
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_ak0_m_ak1_
,
arg
.
b_grid_desc_bk0_n_bk1_
,
arg
.
c_grid_desc_m_n_
))
{
throw
std
::
runtime_error
(
"wrong! DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK has invalid setting"
);
}
const
index_t
grid_size
=
GridwiseGemm
::
CalculateGridSize
(
arg
.
c_grid_desc_m_n_
.
GetLength
(
I0
),
arg
.
c_grid_desc_m_n_
.
GetLength
(
I1
))
*
arg
.
num_group_
;
auto
launch_kernel
=
[
&
](
auto
has_main_k_block_loop
,
auto
has_double_tail_k_block_loop
)
{
constexpr
bool
has_main_loop
=
has_main_k_block_loop
.
value
;
constexpr
bool
has_double_loop
=
has_double_tail_k_block_loop
;
const
auto
kernel
=
kernel_grouped_conv_fwd_dl
<
GridwiseGemm
,
ADataType
,
// TODO: distiguish A/B datatype
CDataType
,
DeviceOp
::
AGridDesc_K0_M0_M1_K1
,
DeviceOp
::
BGridDesc_K0_N0_N1_K1
,
DeviceOp
::
CGridDesc_M0_M10_M11_N0_N10_N11
,
DefaultBlock2CTileMap
,
ComputePtrOffsetOfStridedBatch
,
has_main_loop
,
has_double_loop
>
;
return
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
a_g_n_c_wis_lengths_
[
0
],
// Group count
arg
.
a_grid_desc_k0_m0_m1_k1_
,
arg
.
b_grid_desc_k0_n0_n1_k1_
,
arg
.
c_grid_desc_m0_m10_m11_n0_n10_n11_
,
arg
.
block_2_ctile_map_
,
arg
.
compute_ptr_offset_of_batch_
);
};
const
auto
K0
=
arg
.
a_grid_desc_k0_m0_m1_k1_
.
GetLength
(
I0
);
const
bool
has_main_k_block_loop
=
GridwiseGemm
::
CalculateHasMainKBlockLoop
(
K0
);
const
bool
has_double_tail_k_block_loop
=
GridwiseGemm
::
CalculateHasDoubleTailKBlockLoop
(
K0
);
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
return
launch_kernel
(
integral_constant
<
bool
,
true
>
{},
integral_constant
<
bool
,
true
>
{});
}
else
if
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
{
return
launch_kernel
(
integral_constant
<
bool
,
true
>
{},
integral_constant
<
bool
,
false
>
{});
}
else
if
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
return
launch_kernel
(
integral_constant
<
bool
,
false
>
{},
integral_constant
<
bool
,
true
>
{});
}
else
{
return
launch_kernel
(
integral_constant
<
bool
,
false
>
{},
integral_constant
<
bool
,
false
>
{});
}
}
float
Run
(
const
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
}
};
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
namespace
ctc
=
tensor_layout
::
convolution
;
// check device
if
(
!
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
))
{
return
false
;
}
// check ConvolutionForwardSpecialization
if
constexpr
(
ConvForwardSpecialization
==
ConvolutionForwardSpecialization
::
Filter1x1Stride1Pad0
)
{
// check if it's 1x1, stride=1 conv
for
(
index_t
i
=
0
;
i
<
NDimSpatial
;
++
i
)
{
const
index_t
X
=
arg
.
b_g_k_c_xs_lengths_
[
i
+
3
];
const
index_t
ConvStride
=
arg
.
conv_filter_strides_
[
i
];
const
index_t
LeftPad
=
arg
.
input_left_pads_
[
i
];
const
index_t
RightPad
=
arg
.
input_right_pads_
[
i
];
if
(
!
(
X
==
1
&&
ConvStride
==
1
&&
LeftPad
==
0
&&
RightPad
==
0
))
{
std
::
cout
<<
"Filter1x1Stride1Pad0 check: i = "
<<
i
<<
" X = "
<<
X
<<
" ConvStride = "
<<
ConvStride
<<
" LeftPad = "
<<
LeftPad
<<
" RightPad = "
<<
RightPad
<<
std
::
endl
;
return
false
;
}
}
}
else
if
constexpr
(
ConvForwardSpecialization
==
ConvolutionForwardSpecialization
::
Filter1x1Pad0
)
{
// check if it's 1x1 conv
for
(
index_t
i
=
0
;
i
<
NDimSpatial
;
++
i
)
{
const
index_t
X
=
arg
.
b_g_k_c_xs_lengths_
[
i
+
3
];
const
index_t
LeftPad
=
arg
.
input_left_pads_
[
i
];
const
index_t
RightPad
=
arg
.
input_right_pads_
[
i
];
if
(
!
(
X
==
1
&&
LeftPad
==
0
&&
RightPad
==
0
))
{
std
::
cout
<<
"Filter1x1Stride1Pad0 check: i = "
<<
i
<<
" X = "
<<
X
<<
" LeftPad = "
<<
LeftPad
<<
" RightPad = "
<<
RightPad
<<
std
::
endl
;
return
false
;
}
}
}
// check vector access of A
// FIXME: layout
if
constexpr
(
is_same_v
<
ALayout
,
ctc
::
G_NW_C
>
||
is_same_v
<
ALayout
,
ctc
::
G_NHW_C
>
||
is_same_v
<
ALayout
,
ctc
::
G_NDHW_C
>
||
is_same_v
<
ALayout
,
ctc
::
GNWC
>
||
is_same_v
<
ALayout
,
ctc
::
GNHWC
>
||
is_same_v
<
ALayout
,
ctc
::
GNDHWC
>
||
is_same_v
<
ALayout
,
ctc
::
NWGC
>
||
is_same_v
<
ALayout
,
ctc
::
NHWGC
>
||
is_same_v
<
ALayout
,
ctc
::
NDHWGC
>
)
{
auto
srcVectorLengths
=
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
{};
if
(
srcVectorLengths
[
I1
]
!=
1
||
srcVectorLengths
[
I2
]
!=
1
)
{
return
false
;
}
if
(
K1
%
srcVectorLengths
[
I3
]
!=
0
||
K0PerBlock
%
srcVectorLengths
[
I0
]
!=
0
)
{
return
false
;
}
const
index_t
C
=
arg
.
a_g_n_c_wis_lengths_
[
2
];
if
(
C
%
(
srcVectorLengths
[
I0
]
*
srcVectorLengths
[
I3
])
!=
0
)
{
return
false
;
}
}
else
{
return
false
;
}
// check vector access of B
// FIXME: layout
if
constexpr
(
is_same_v
<
BLayout
,
ctc
::
G_K_X_C
>
||
is_same_v
<
BLayout
,
ctc
::
G_K_YX_C
>
||
is_same_v
<
BLayout
,
ctc
::
G_K_ZYX_C
>
||
is_same_v
<
BLayout
,
ctc
::
GKXC
>
||
is_same_v
<
BLayout
,
ctc
::
GKYXC
>
||
is_same_v
<
BLayout
,
ctc
::
GKZYXC
>
||
is_same_v
<
BLayout
,
ctc
::
KXGC
>
||
is_same_v
<
BLayout
,
ctc
::
KYXGC
>
||
is_same_v
<
BLayout
,
ctc
::
KZYXGC
>
)
{
auto
srcVectorLengths
=
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
{};
if
(
srcVectorLengths
[
I1
]
!=
1
||
srcVectorLengths
[
I2
]
!=
1
)
{
return
false
;
}
if
(
K1
%
srcVectorLengths
[
I3
]
!=
0
||
K0PerBlock
%
srcVectorLengths
[
I0
]
!=
0
)
{
return
false
;
}
const
index_t
C
=
arg
.
b_g_k_c_xs_lengths_
[
2
];
if
(
C
%
(
srcVectorLengths
[
I0
]
*
srcVectorLengths
[
I3
])
!=
0
)
{
return
false
;
}
}
else
{
return
false
;
}
// check vector access of C
if
constexpr
(
is_same_v
<
CLayout
,
ctc
::
G_NW_K
>
||
is_same_v
<
CLayout
,
ctc
::
G_NHW_K
>
||
is_same_v
<
CLayout
,
ctc
::
G_NDHW_K
>
||
is_same_v
<
CLayout
,
ctc
::
GNWK
>
||
is_same_v
<
CLayout
,
ctc
::
GNHWK
>
||
is_same_v
<
CLayout
,
ctc
::
GNDHWK
>
||
is_same_v
<
CLayout
,
ctc
::
NWGK
>
||
is_same_v
<
CLayout
,
ctc
::
NHWGK
>
||
is_same_v
<
CLayout
,
ctc
::
NDHWGK
>
)
{
const
index_t
K
=
arg
.
c_g_n_k_wos_lengths_
[
2
];
if
(
!
(
K
%
CThreadTransferDstScalarPerVector
==
0
&&
CThreadTransferSrcDstVectorDim
==
5
))
{
return
false
;
}
}
else
{
return
false
;
}
// check Gridwise GEMM
return
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_ak0_m_ak1_
,
arg
.
b_grid_desc_bk0_n_bk1_
,
arg
.
c_grid_desc_m_n_
);
}
bool
IsSupportedArgument
(
const
BaseArgument
*
p_arg
)
override
{
return
IsSupportedArgument
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
static
auto
MakeArgument
(
const
void
*
p_a
,
const
void
*
p_b
,
void
*
p_c
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
c_g_n_k_wos_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
c_g_n_k_wos_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_right_pads
,
const
AElementwiseOperation
&
a_element_op
,
const
BElementwiseOperation
&
b_element_op
,
const
CElementwiseOperation
&
c_element_op
)
{
return
Argument
{
p_a
,
p_b
,
p_c
,
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
c_g_n_k_wos_lengths
,
c_g_n_k_wos_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
a_element_op
,
b_element_op
,
c_element_op
};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_a
,
const
void
*
p_b
,
void
*
p_c
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
c_g_n_k_wos_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
c_g_n_k_wos_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
index_t
,
NDimSpatial
>&
input_right_pads
,
const
AElementwiseOperation
&
a_element_op
,
const
BElementwiseOperation
&
b_element_op
,
const
CElementwiseOperation
&
c_element_op
)
override
{
return
std
::
make_unique
<
Argument
>
(
p_a
,
p_b
,
p_c
,
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
c_g_n_k_wos_lengths
,
c_g_n_k_wos_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
a_element_op
,
b_element_op
,
c_element_op
);
}
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
override
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK"
<<
"<"
<<
BlockSize
<<
", "
<<
MPerBlock
<<
", "
<<
NPerBlock
<<
", "
<<
K0PerBlock
<<
", "
<<
getConvForwardSpecializationString
(
ConvForwardSpecialization
)
<<
">"
;
// clang-format on
return
str
.
str
();
}
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp
View file @
24af0144
...
@@ -7,46 +7,50 @@
...
@@ -7,46 +7,50 @@
#include <vector>
#include <vector>
#include "device_base.hpp"
#include "device_base.hpp"
#include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
template
<
typename
ALayout
,
template
<
index_t
NumDimG
,
typename
B0Layout
,
index_t
NumDimM
,
typename
B1Layout
,
index_t
NumDimN
,
typename
CPermuteNumDims_G_M_Gemm1N
,
// Sequence<>
index_t
NumDimK
,
index_t
NumDimO
,
typename
ADataType
,
typename
ADataType
,
typename
B0DataType
,
typename
B0DataType
,
typename
B1DataType
,
typename
B1DataType
,
typename
CDataType
,
typename
CDataType
,
typename
Acc0BiasDataType
,
typename
Acc1BiasDataType
,
typename
AElementwiseOperation
,
typename
AElementwiseOperation
,
typename
B0ElementwiseOperation
,
typename
B0ElementwiseOperation
,
typename
Acc0ElementwiseOperation
,
typename
Acc0ElementwiseOperation
,
typename
B1ElementwiseOperation
,
typename
B1ElementwiseOperation
,
typename
CElementwiseOperation
>
typename
CElementwiseOperation
,
MaskingSpecialization
MaskingSpec
>
struct
DeviceGroupedGemmSoftmaxGemmPermute
:
public
BaseOperator
struct
DeviceGroupedGemmSoftmaxGemmPermute
:
public
BaseOperator
{
{
struct
ProblemDesc
struct
ProblemDesc
{
{
// Overall problem shape
std
::
vector
<
index_t
>
a_gs_ms_ks_lengths
;
index_t
M
;
std
::
vector
<
index_t
>
a_gs_ms_ks_strides
;
index_t
N
;
index_t
K
;
index_t
O
;
index_t
Batch
;
// Stride for A/B0/B1; layout determined by template args
std
::
vector
<
index_t
>
b0_gs_ns_ks_lengths
;
index_t
StrideA
;
std
::
vector
<
index_t
>
b0_gs_ns_ks_strides
;
index_t
StrideB0
;
index_t
StrideB1
;
std
::
vector
<
index_t
>
b1_gs_os_ns_lengths
;
index_t
BatchStrideA
;
std
::
vector
<
index_t
>
b1_gs_os_ns_strides
;
index_t
BatchStrideB0
;
index_t
BatchStrideB1
;
// Lengths and strides for output C
std
::
vector
<
index_t
>
c_gs_ms_os_lengths
;
std
::
vector
<
index_t
>
c_gs_ms_os_lengths
;
std
::
vector
<
index_t
>
c_gs_ms_os_strides
;
std
::
vector
<
index_t
>
c_gs_ms_os_strides
;
std
::
vector
<
std
::
vector
<
index_t
>>
acc0_biases_gs_ms_ns_lengths
;
std
::
vector
<
std
::
vector
<
index_t
>>
acc0_biases_gs_ms_ns_strides
;
std
::
vector
<
std
::
vector
<
index_t
>>
acc1_biases_gs_ms_os_lengths
;
std
::
vector
<
std
::
vector
<
index_t
>>
acc1_biases_gs_ms_os_strides
;
};
};
virtual
std
::
unique_ptr
<
BaseArgument
>
virtual
std
::
unique_ptr
<
BaseArgument
>
...
@@ -54,6 +58,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute : public BaseOperator
...
@@ -54,6 +58,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute : public BaseOperator
std
::
vector
<
const
void
*>
p_b0_vec
,
std
::
vector
<
const
void
*>
p_b0_vec
,
std
::
vector
<
const
void
*>
p_b1_vec
,
std
::
vector
<
const
void
*>
p_b1_vec
,
std
::
vector
<
void
*>
p_c_vec
,
std
::
vector
<
void
*>
p_c_vec
,
std
::
vector
<
std
::
vector
<
const
void
*>>
p_acc0_biases_vec
,
std
::
vector
<
std
::
vector
<
const
void
*>>
p_acc1_biases_vec
,
std
::
vector
<
ProblemDesc
>
problem_desc_vec
,
std
::
vector
<
ProblemDesc
>
problem_desc_vec
,
AElementwiseOperation
a_element_op
,
AElementwiseOperation
a_element_op
,
B0ElementwiseOperation
b0_element_op
,
B0ElementwiseOperation
b0_element_op
,
...
...
include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
View file @
24af0144
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp"
#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/kernel_launch.hpp"
...
@@ -54,9 +55,8 @@ __global__ void
...
@@ -54,9 +55,8 @@ __global__ void
index_t
right
=
group_count
;
index_t
right
=
group_count
;
index_t
group_id
=
index_t
((
left
+
right
)
/
2
);
index_t
group_id
=
index_t
((
left
+
right
)
/
2
);
while
((
!
(
block_id
>=
arg_ptr
[
group_id
].
block_start_
&&
while
(
block_id
<
arg_ptr
[
group_id
].
block_end_
))
&&
(
!
(
block_id
>=
arg_ptr
[
group_id
].
block_start_
&&
block_id
<
arg_ptr
[
group_id
].
block_end_
)))
left
<=
right
)
{
{
if
(
block_id
<
arg_ptr
[
group_id
].
block_start_
)
if
(
block_id
<
arg_ptr
[
group_id
].
block_start_
)
{
{
...
@@ -114,14 +114,17 @@ __global__ void
...
@@ -114,14 +114,17 @@ __global__ void
// Computes C = A * B0 * B1
// Computes C = A * B0 * B1
// ^^^^^^ (Acc0)
// ^^^^^^ (Acc0)
// ^^^^^^^^^^^ (Acc1)
// ^^^^^^^^^^^ (Acc1)
template
<
typename
ALayout
,
template
<
index_t
NumDimG
,
typename
BLayout
,
// B0Layout
index_t
NumDimM
,
typename
B1Layout
,
index_t
NumDimN
,
typename
CPermuteNumDims_G_M_Gemm1N
,
// Sequence<NumDimG, NumDimM, NumDimGemm1N>
index_t
NumDimK
,
index_t
NumDimO
,
// NumDimGemm1N
typename
ADataType
,
typename
ADataType
,
typename
BDataType
,
typename
BDataType
,
typename
B1DataType
,
typename
B1DataType
,
typename
CDataType
,
typename
CDataType
,
typename
Acc0BiasDataType
,
typename
Acc1BiasDataType
,
typename
GemmAccDataType
,
typename
GemmAccDataType
,
typename
CShuffleDataType
,
typename
CShuffleDataType
,
typename
AElementwiseOperation
,
typename
AElementwiseOperation
,
...
@@ -130,6 +133,10 @@ template <typename ALayout,
...
@@ -130,6 +133,10 @@ template <typename ALayout,
typename
B1ElementwiseOperation
,
typename
B1ElementwiseOperation
,
typename
CElementwiseOperation
,
typename
CElementwiseOperation
,
GemmSpecialization
GemmSpec
,
GemmSpecialization
GemmSpec
,
TensorSpecialization
ASpec
,
TensorSpecialization
BSpec
,
TensorSpecialization
B1Spec
,
TensorSpecialization
CSpec
,
index_t
NumGemmKPrefetchStage
,
index_t
NumGemmKPrefetchStage
,
index_t
BlockSize
,
index_t
BlockSize
,
index_t
MPerBlock
,
index_t
MPerBlock
,
...
@@ -170,297 +177,152 @@ template <typename ALayout,
...
@@ -170,297 +177,152 @@ template <typename ALayout,
index_t
CShuffleNXdlPerWavePerShuffle
,
index_t
CShuffleNXdlPerWavePerShuffle
,
typename
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
,
typename
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
,
index_t
CShuffleBlockTransferScalarPerVector_NPerBlock
,
index_t
CShuffleBlockTransferScalarPerVector_NPerBlock
,
bool
MaskOutUpperTriangle
,
MaskingSpecialization
MaskingSpec
,
LoopScheduler
LoopSched
=
LoopScheduler
::
Default
>
LoopScheduler
LoopSched
=
LoopScheduler
::
Default
>
struct
DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
struct
DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
:
public
DeviceGroupedGemmSoftmaxGemmPermute
<
ALayout
,
:
public
DeviceGroupedGemmSoftmaxGemmPermute
<
NumDimG
,
BLayout
,
NumDimM
,
B1Layout
,
NumDimN
,
CPermuteNumDims_G_M_Gemm1N
,
NumDimK
,
NumDimO
,
ADataType
,
ADataType
,
BDataType
,
BDataType
,
B1DataType
,
B1DataType
,
CDataType
,
CDataType
,
Acc0BiasDataType
,
Acc1BiasDataType
,
AElementwiseOperation
,
AElementwiseOperation
,
BElementwiseOperation
,
BElementwiseOperation
,
AccElementwiseOperation
,
AccElementwiseOperation
,
B1ElementwiseOperation
,
B1ElementwiseOperation
,
CElementwiseOperation
>
CElementwiseOperation
,
MaskingSpec
>
{
{
using
DeviceOp
=
DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
;
static_assert
(
NumDimG
>
0
&&
NumDimM
>
0
&&
NumDimN
>
0
&&
NumDimK
>
0
&&
NumDimO
>
0
,
using
ProblemDesc
=
"Number of dimension must be greater than 0"
);
typename
DeviceGroupedGemmSoftmaxGemmPermute
<
ALayout
,
BLayout
,
static
constexpr
index_t
NumAcc0Bias
=
Acc0BiasDataType
::
Size
();
B1Layout
,
static
constexpr
index_t
NumAcc1Bias
=
Acc1BiasDataType
::
Size
();
CPermuteNumDims_G_M_Gemm1N
,
ADataType
,
// TODO ANT: implement bias combination
BDataType
,
static_assert
(
NumAcc0Bias
==
0
&&
NumAcc0Bias
==
0
,
"Bias addition is unimplemented"
);
B1DataType
,
CDataType
,
#if 0
AElementwiseOperation
,
// TODO ANT: use alias
BElementwiseOperation
,
static constexpr index_t NumDimGemm0M = NumDimM;
AccElementwiseOperation
,
static constexpr index_t NumDimGemm0N = NumDimN;
B1ElementwiseOperation
,
static constexpr index_t NumDimGemm0K = NumDimK;
CElementwiseOperation
>::
ProblemDesc
;
static constexpr index_t NumDimGemm1M = NumDimM;
static constexpr index_t NumDimGemm1N = NumDimO;
static constexpr index_t NumDimGemm1K = NumDimN;
#endif
using
DeviceOp
=
DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
;
using
ProblemDesc
=
typename
DeviceGroupedGemmSoftmaxGemmPermute
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
ADataType
,
BDataType
,
B1DataType
,
CDataType
,
Acc0BiasDataType
,
Acc1BiasDataType
,
AElementwiseOperation
,
BElementwiseOperation
,
AccElementwiseOperation
,
B1ElementwiseOperation
,
CElementwiseOperation
,
MaskingSpec
>::
ProblemDesc
;
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
matrix_padder
=
using
Transform
=
TransformBatchedContractionContractionToBatchedGemmGemm
<
GemmGemmPadder
<
GemmSpec
,
index_t
,
index_t
,
index_t
,
index_t
>
{
Sequence
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
>
,
MPerBlock
,
NPerBlock
,
KPerBlock
,
Gemm1NPerBlock
};
Sequence
<
MPerBlock
,
NPerBlock
,
KPerBlock
,
Gemm1NPerBlock
>
,
GemmSpec
,
static
auto
MakeAGridDescriptor_AK0_M_AK1
(
index_t
MRaw
,
index_t
KRaw
,
index_t
StrideA
)
ASpec
,
{
BSpec
,
const
auto
a_grid_desc_mraw_kraw
=
[
&
]()
{
B1Spec
,
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
RowMajor
,
ALayout
>
)
CSpec
>
;
{
return
make_naive_tensor_descriptor
(
make_tuple
(
MRaw
,
KRaw
),
static
auto
MakeAGridDescriptor_AK0_M_AK1
(
const
std
::
vector
<
index_t
>&
a_gs_ms_ks_lengths_vec
,
make_tuple
(
StrideA
,
I1
));
const
std
::
vector
<
index_t
>&
a_gs_ms_ks_strides_vec
)
}
else
if
constexpr
(
is_same_v
<
tensor_layout
::
gemm
::
ColumnMajor
,
ALayout
>
)
{
return
make_naive_tensor_descriptor
(
make_tuple
(
MRaw
,
KRaw
),
make_tuple
(
I1
,
StrideA
));
}
}();
const
auto
a_grid_desc_m_k
=
matrix_padder
.
PadADescriptor_M_K
(
a_grid_desc_mraw_kraw
);
const
auto
M
=
a_grid_desc_m_k
.
GetLength
(
I0
);
const
auto
K
=
a_grid_desc_m_k
.
GetLength
(
I1
);
const
auto
AK0
=
K
/
AK1
;
return
transform_tensor_descriptor
(
a_grid_desc_m_k
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
AK0
,
AK1
)),
make_pass_through_transform
(
M
)),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
>
{}));
}
static
auto
MakeBGridDescriptor_BK0_N_BK1
(
index_t
KRaw
,
index_t
NRaw
,
index_t
StrideB
)
{
const
auto
b_grid_desc_nraw_kraw
=
[
&
]()
{
if
constexpr
(
is_same
<
tensor_layout
::
gemm
::
RowMajor
,
BLayout
>::
value
)
{
return
make_naive_tensor_descriptor
(
make_tuple
(
NRaw
,
KRaw
),
make_tuple
(
I1
,
StrideB
));
}
else
if
constexpr
(
is_same
<
tensor_layout
::
gemm
::
ColumnMajor
,
BLayout
>::
value
)
{
return
make_naive_tensor_descriptor
(
make_tuple
(
NRaw
,
KRaw
),
make_tuple
(
StrideB
,
I1
));
}
}();
const
auto
b_grid_desc_n_k
=
matrix_padder
.
PadBDescriptor_N_K
(
b_grid_desc_nraw_kraw
);
const
auto
N
=
b_grid_desc_n_k
.
GetLength
(
I0
);
const
auto
K
=
b_grid_desc_n_k
.
GetLength
(
I1
);
const
auto
BK0
=
K
/
BK1
;
return
transform_tensor_descriptor
(
b_grid_desc_n_k
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
BK0
,
BK1
)),
make_pass_through_transform
(
N
)),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
>
{}));
}
// Args: Gemm1KRaw, Gemm1NRaw, StrideB1
static
auto
MakeB1GridDescriptor_BK0_N_BK1
(
index_t
KRaw
,
index_t
NRaw
,
index_t
StrideB
)
{
{
const
auto
b1_grid_desc_nraw_kraw
=
[
&
]()
{
return
Transform
::
MakeAGridDescriptor_AK0_M_AK1
(
if
constexpr
(
is_same
<
tensor_layout
::
gemm
::
RowMajor
,
B1Layout
>::
value
)
Transform
::
MakeAGridDescriptor_M_K
(
a_gs_ms_ks_lengths_vec
,
a_gs_ms_ks_strides_vec
),
{
Number
<
AK1
>
{});
return
make_naive_tensor_descriptor
(
make_tuple
(
NRaw
,
KRaw
),
make_tuple
(
I1
,
StrideB
));
}
else
if
constexpr
(
is_same
<
tensor_layout
::
gemm
::
ColumnMajor
,
B1Layout
>::
value
)
{
return
make_naive_tensor_descriptor
(
make_tuple
(
NRaw
,
KRaw
),
make_tuple
(
StrideB
,
I1
));
}
}();
const
auto
b1_grid_desc_n_k
=
matrix_padder
.
PadB1Descriptor_N_K
(
b1_grid_desc_nraw_kraw
);
const
auto
N
=
b1_grid_desc_n_k
.
GetLength
(
I0
);
const
auto
K
=
b1_grid_desc_n_k
.
GetLength
(
I1
);
const
auto
B1K0
=
K
/
B1K1
;
return
transform_tensor_descriptor
(
b1_grid_desc_n_k
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
B1K0
,
B1K1
)),
make_pass_through_transform
(
N
)),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
,
2
>
{},
Sequence
<
1
>
{}));
}
}
// assume C[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...]
static
auto
MakeBGridDescriptor_BK0_N_BK1
(
const
std
::
vector
<
index_t
>&
b_gs_ns_ks_lengths_vec
,
static
auto
MakeCGridDescriptor_M_N
(
const
std
::
vector
<
index_t
>&
c_gs_ms_ns_lengths_vec
,
const
std
::
vector
<
index_t
>&
b_gs_ns_ks_strides_vec
)
const
std
::
vector
<
index_t
>&
c_gs_ms_ns_strides_vec
)
{
{
constexpr
index_t
NumDimG
=
CPermuteNumDims_G_M_Gemm1N
::
At
(
I0
);
return
Transform
::
MakeB0GridDescriptor_BK0_N_BK1
(
constexpr
index_t
NumDimM
=
CPermuteNumDims_G_M_Gemm1N
::
At
(
I1
);
Transform
::
MakeB0GridDescriptor_N_K
(
b_gs_ns_ks_lengths_vec
,
b_gs_ns_ks_strides_vec
),
constexpr
index_t
NumDimN
=
CPermuteNumDims_G_M_Gemm1N
::
At
(
I2
);
// NumDimGemm1N
Number
<
BK1
>
{});
assert
(
c_gs_ms_ns_lengths_vec
.
size
()
==
NumDimG
+
NumDimM
+
NumDimN
&&
c_gs_ms_ns_strides_vec
.
size
()
==
NumDimG
+
NumDimM
+
NumDimN
);
const
auto
to_tuple
=
[
&
](
auto
&
vec
,
auto
start
,
auto
end
)
{
return
generate_tuple
([
&
](
auto
i
)
{
return
vec
[
start
+
i
];
},
Number
<
end
-
start
>
{});
};
const
auto
c_ms_ns_lengths
=
to_tuple
(
c_gs_ms_ns_lengths_vec
,
Number
<
NumDimG
>
{},
Number
<
NumDimG
+
NumDimM
+
NumDimN
>
{});
const
auto
c_ms_ns_strides
=
to_tuple
(
c_gs_ms_ns_strides_vec
,
Number
<
NumDimG
>
{},
Number
<
NumDimG
+
NumDimM
+
NumDimN
>
{});
// dimension Ids for M0, M1, ...
constexpr
auto
mDimIds
=
typename
arithmetic_sequence_gen
<
0
,
NumDimM
,
1
>::
type
{};
// dimension Ids for N0, N1, ...
constexpr
auto
nDimIds
=
typename
arithmetic_sequence_gen
<
NumDimM
,
NumDimM
+
NumDimN
,
1
>::
type
{};
// lengths for M0, M1, ...
const
auto
mLengths
=
get_container_subset
(
c_ms_ns_lengths
,
mDimIds
);
// lengths for K0, K1, ...
const
auto
nLengths
=
get_container_subset
(
c_ms_ns_lengths
,
nDimIds
);
// naive tensor C[M0, M1, M2, ..., N0, N1, N2...]
const
auto
c_grid_desc_ms_ns
=
make_naive_tensor_descriptor
(
c_ms_ns_lengths
,
c_ms_ns_strides
);
// transformed tensor C[MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * N2 * ...]
const
auto
c_grid_desc_mraw_nraw
=
transform_tensor_descriptor
(
c_grid_desc_ms_ns
,
make_tuple
(
make_merge_transform
(
mLengths
),
make_merge_transform
(
nLengths
)),
make_tuple
(
mDimIds
,
nDimIds
),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
matrix_padder
.
PadCDescriptor_M_N
(
c_grid_desc_mraw_nraw
);
}
}
// assume C[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...]
static
auto
static
auto
Make
C
GridDescriptor_
G_M_N
(
const
std
::
vector
<
index_t
>&
c
_gs_
ms_n
s_lengths_vec
,
Make
B1
GridDescriptor_
BK0_N_BK1
(
const
std
::
vector
<
index_t
>&
b1
_gs_
gemm1ns_gemm1k
s_lengths_vec
,
const
std
::
vector
<
index_t
>&
c
_gs_
ms_n
s_strides_vec
)
const
std
::
vector
<
index_t
>&
b1
_gs_
gemm1ns_gemm1k
s_strides_vec
)
{
{
constexpr
index_t
NumDimG
=
CPermuteNumDims_G_M_Gemm1N
::
At
(
I0
);
return
Transform
::
MakeB1GridDescriptor_BK0_N_BK1
(
constexpr
index_t
NumDimM
=
CPermuteNumDims_G_M_Gemm1N
::
At
(
I1
);
Transform
::
MakeB1GridDescriptor_N_K
(
b1_gs_gemm1ns_gemm1ks_lengths_vec
,
constexpr
index_t
NumDimN
=
CPermuteNumDims_G_M_Gemm1N
::
At
(
I2
);
// NumDimGemm1N
b1_gs_gemm1ns_gemm1ks_strides_vec
),
Number
<
B1K1
>
{});
assert
(
c_gs_ms_ns_lengths_vec
.
size
()
==
NumDimG
+
NumDimM
+
NumDimN
&&
c_gs_ms_ns_strides_vec
.
size
()
==
NumDimG
+
NumDimM
+
NumDimN
);
const
auto
to_tuple
=
[
&
](
auto
&
vec
,
auto
start
,
auto
end
)
{
return
generate_tuple
([
&
](
auto
i
)
{
return
vec
[
start
+
i
];
},
Number
<
end
-
start
>
{});
};
const
auto
c_gs_ms_ns_lengths
=
to_tuple
(
c_gs_ms_ns_lengths_vec
,
Number
<
0
>
{},
Number
<
NumDimG
+
NumDimM
+
NumDimN
>
{});
const
auto
c_gs_ms_ns_strides
=
to_tuple
(
c_gs_ms_ns_strides_vec
,
Number
<
0
>
{},
Number
<
NumDimG
+
NumDimM
+
NumDimN
>
{});
// dimension Ids for G0, G1, ...
constexpr
auto
gDimIds
=
typename
arithmetic_sequence_gen
<
0
,
NumDimG
,
1
>::
type
{};
// dimension Ids for M0, M1, ...
constexpr
auto
mDimIds
=
typename
arithmetic_sequence_gen
<
NumDimG
,
NumDimG
+
NumDimM
,
1
>::
type
{};
// dimension Ids for N0, N1, ...
constexpr
auto
nDimIds
=
typename
arithmetic_sequence_gen
<
NumDimG
+
NumDimM
,
NumDimG
+
NumDimM
+
NumDimN
,
1
>::
type
{};
// lengths for G0, G1, ...
const
auto
gLengths
=
get_container_subset
(
c_gs_ms_ns_lengths
,
gDimIds
);
// lengths for M0, M1, ...
const
auto
mLengths
=
get_container_subset
(
c_gs_ms_ns_lengths
,
mDimIds
);
// lengths for K0, K1, ...
const
auto
nLengths
=
get_container_subset
(
c_gs_ms_ns_lengths
,
nDimIds
);
// naive tensor C[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...]
const
auto
c_grid_desc_gs_ms_ns
=
make_naive_tensor_descriptor
(
c_gs_ms_ns_lengths
,
c_gs_ms_ns_strides
);
// transformed tensor C[G = G0 * G1 * ..., MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 *
// N2 * ...]
const
auto
c_grid_desc_g_mraw_nraw
=
transform_tensor_descriptor
(
c_grid_desc_gs_ms_ns
,
make_tuple
(
make_merge_transform
(
gLengths
),
make_merge_transform
(
mLengths
),
make_merge_transform
(
nLengths
)),
make_tuple
(
gDimIds
,
mDimIds
,
nDimIds
),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}));
// this desc is only for calculating batch offset so no padding needed
return
c_grid_desc_g_mraw_nraw
;
}
}
using
AGridDesc_AK0_M_AK1
=
decltype
(
MakeAGridDescriptor_AK0_M_AK1
(
1
,
1
,
1
));
using
AGridDesc_AK0_M_AK1
=
decltype
(
MakeAGridDescriptor_AK0_M_AK1
({},
{}));
using
BGridDesc_BK0_N_BK1
=
decltype
(
MakeBGridDescriptor_BK0_N_BK1
(
1
,
1
,
1
));
using
BGridDesc_BK0_N_BK1
=
decltype
(
MakeBGridDescriptor_BK0_N_BK1
({},
{}));
using
B1GridDesc_BK0_N_BK1
=
decltype
(
MakeB1GridDescriptor_BK0_N_BK1
(
1
,
1
,
1
));
using
B1GridDesc_BK0_N_BK1
=
decltype
(
MakeB1GridDescriptor_BK0_N_BK1
({},
{}));
using
CGridDesc_M_N
=
decltype
(
MakeCGridDescriptor_M_N
({},
{}));
using
CGridDesc_M_N
=
decltype
(
Transform
::
MakeCGridDescriptor_M_N
({},
{}));
using
CGridDesc_G_M_N
=
decltype
(
MakeCGridDescriptor_G_M_N
({},
{}));
using
AGridDesc_G_M_K
=
decltype
(
Transform
::
MakeAGridDescriptor_G_M_K
({},
{}));
using
BGridDesc_G_N_K
=
decltype
(
Transform
::
MakeB0GridDescriptor_G_N_K
({},
{}));
using
B1GridDesc_G_N_K
=
decltype
(
Transform
::
MakeB1GridDescriptor_G_N_K
({},
{}));
using
CGridDesc_G_M_N
=
decltype
(
Transform
::
MakeCGridDescriptor_G_M_N
({},
{}));
// to track the points which need to be set to -inf on C0
constexpr
static
auto
make_MaskOutPredicate
()
// Note: no need to reset M padding value, because they will not be stored out.
struct
C0MatrixMask
{
{
C0MatrixMask
(
index_t
NRaw
)
:
NRaw_
(
NRaw
)
{}
if
constexpr
(
MaskingSpec
==
MaskingSpecialization
::
MaskDisabled
)
__host__
__device__
bool
IsUpperTriangle
(
index_t
m
,
index_t
n
)
const
{
return
n
>
m
;
}
__host__
__device__
bool
IsNOutOfBound
(
/*index_t m, */
index_t
n
)
const
{
{
return
n
>=
NRaw_
;
return
MaskDisabledPredicate
{}
;
}
}
else
if
constexpr
(
MaskingSpec
==
MaskingSpecialization
::
MaskOutUpperTriangle
)
__host__
__device__
bool
IsMaskedElement
(
index_t
m
,
index_t
n
)
const
{
{
return
Is
UpperTriangle
(
m
,
n
)
||
IsNOutOfBound
(
n
)
;
return
MaskOut
UpperTriangle
Predicate
{}
;
}
}
}
private:
using
C0MatrixMask
=
C0MatrixMask_impl
<
decltype
(
make_MaskOutPredicate
())
>
;
// index_t MRaw_;
index_t
NRaw_
;
};
struct
ComputeBasePtrOfStridedBatch
struct
ComputeBasePtrOfStridedBatch
{
{
ComputeBasePtrOfStridedBatch
(
index_t
BatchStrideA
,
ComputeBasePtrOfStridedBatch
(
const
AGridDesc_G_M_K
&
a_grid_desc_g_m_k
,
index_t
BatchStrideB
,
const
BGridDesc_G_N_K
&
b_grid_desc_g_n_k
,
index_t
BatchStrideB1
,
const
B1GridDesc_G_N_K
&
b1_grid_desc_g_n_k
,
CGridDesc_G_M_N
c_grid_desc_g_m_n
)
const
CGridDesc_G_M_N
&
c_grid_desc_g_m_n
)
:
BatchStrideA_
(
BatchStrideA
),
:
a_grid_desc_g_m_k_
(
a_grid_desc_g_m_k
),
BatchStrideB_
(
BatchStrideB
),
b_grid_desc_g_n_k_
(
b_grid_desc_g_n_k
),
BatchStrideB1_
(
BatchStrideB1
),
b1_grid_desc_g_n_k_
(
b1_grid_desc_g_n_k
),
c_grid_desc_g_m_n_
(
c_grid_desc_g_m_n
)
c_grid_desc_g_m_n_
(
c_grid_desc_g_m_n
)
{
{
}
}
__host__
__device__
constexpr
long_index_t
GetABasePtr
(
index_t
g_idx
)
const
__host__
__device__
constexpr
long_index_t
GetABasePtr
(
index_t
g_idx
)
const
{
{
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideA_
);
return
a_grid_desc_g_m_k_
.
CalculateOffset
(
make_multi_index
(
g_idx
,
0
,
0
)
);
}
}
__host__
__device__
constexpr
long_index_t
GetBBasePtr
(
index_t
g_idx
)
const
__host__
__device__
constexpr
long_index_t
GetBBasePtr
(
index_t
g_idx
)
const
{
{
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideB_
);
return
b_grid_desc_g_n_k_
.
CalculateOffset
(
make_multi_index
(
g_idx
,
0
,
0
)
);
}
}
__host__
__device__
constexpr
long_index_t
GetB1BasePtr
(
index_t
g_idx
)
const
__host__
__device__
constexpr
long_index_t
GetB1BasePtr
(
index_t
g_idx
)
const
{
{
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideB1_
);
return
b1_grid_desc_g_n_k_
.
CalculateOffset
(
make_multi_index
(
g_idx
,
0
,
0
)
);
}
}
__host__
__device__
constexpr
long_index_t
GetCBasePtr
(
index_t
g_idx
)
const
__host__
__device__
constexpr
long_index_t
GetCBasePtr
(
index_t
g_idx
)
const
...
@@ -469,9 +331,9 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -469,9 +331,9 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
}
}
private:
private:
index_t
BatchStrideA
_
;
AGridDesc_G_M_K
a_grid_desc_g_m_k
_
;
index_t
BatchStrideB
_
;
BGridDesc_G_N_K
b_grid_desc_g_n_k
_
;
index_t
BatchStrideB1
_
;
B1GridDesc_G_N_K
b1_grid_desc_g_n_k
_
;
CGridDesc_G_M_N
c_grid_desc_g_m_n_
;
CGridDesc_G_M_N
c_grid_desc_g_m_n_
;
};
};
...
@@ -535,8 +397,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -535,8 +397,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
,
CShuffleBlockTransferScalarPerVector_NPerBlock
,
CShuffleBlockTransferScalarPerVector_NPerBlock
,
LoopSched
,
LoopSched
,
matrix_padder
.
PadN
,
Transform
::
matrix_padder
.
PadN
,
MaskOutUpperTriangle
>
;
MaskingSpec
==
MaskingSpecialization
::
MaskOutUpperTriangle
>
;
using
Block2CTileMap
=
OffsettedBlockToCTileMap
<
typename
GridwiseGemm
::
DefaultBlock2CTileMap
>
;
using
Block2CTileMap
=
OffsettedBlockToCTileMap
<
typename
GridwiseGemm
::
DefaultBlock2CTileMap
>
;
...
@@ -570,16 +432,16 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -570,16 +432,16 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
struct
GroupDeviceArg
struct
GroupDeviceArg
{
{
// problem definiton
// lengths for the last dimensions of overall problem for sanity check of vector load/store
index_t
M
;
std
::
vector
<
index_t
>
raw_lengths_mz_nz_kz_gemm1nz_
;
index_t
N
;
index_t
K
;
index_t
O
;
// Strides for the last dimensions of C for sanity check of vector load/store
// strides for the last dimensions of each tensor for sanity check of vector load/store
index_t
c_extent_lowest_
;
std
::
vector
<
index_t
>
a_mz_kz_strides_
;
index_t
c_stride_lowest_
;
std
::
vector
<
index_t
>
b_nz_kz_strides_
;
std
::
vector
<
index_t
>
b1_nz_kz_strides_
;
std
::
vector
<
index_t
>
c_mz_gemm1nz_strides_
;
// for gridwise gemm check
CGridDesc_M_N
c_grid_desc_m_n_
;
CGridDesc_M_N
c_grid_desc_m_n_
;
};
};
...
@@ -591,6 +453,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -591,6 +453,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
std
::
vector
<
const
void
*>
p_b_vec
,
std
::
vector
<
const
void
*>
p_b_vec
,
std
::
vector
<
const
void
*>
p_b1_vec
,
std
::
vector
<
const
void
*>
p_b1_vec
,
std
::
vector
<
void
*>
p_c_vec
,
std
::
vector
<
void
*>
p_c_vec
,
std
::
vector
<
std
::
vector
<
const
void
*>>
p_acc0_biases_vec
,
std
::
vector
<
std
::
vector
<
const
void
*>>
p_acc1_biases_vec
,
std
::
vector
<
ProblemDesc
>
problem_desc_vec
,
std
::
vector
<
ProblemDesc
>
problem_desc_vec
,
AElementwiseOperation
a_element_op
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
BElementwiseOperation
b_element_op
,
...
@@ -603,6 +467,7 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -603,6 +467,7 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
b1_element_op_
{
b1_element_op
},
b1_element_op_
{
b1_element_op
},
c_element_op_
{
c_element_op
}
c_element_op_
{
c_element_op
}
{
{
// TODO ANT: implement bias addition
group_count_
=
problem_desc_vec
.
size
();
group_count_
=
problem_desc_vec
.
size
();
if
(
!
(
group_count_
==
p_a_vec
.
size
()
&&
group_count_
==
p_b_vec
.
size
()
&&
if
(
!
(
group_count_
==
p_a_vec
.
size
()
&&
group_count_
==
p_b_vec
.
size
()
&&
...
@@ -611,6 +476,11 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -611,6 +476,11 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
throw
std
::
runtime_error
(
"wrong! group_count_ != a/b/b1/c_vec.size"
);
throw
std
::
runtime_error
(
"wrong! group_count_ != a/b/b1/c_vec.size"
);
}
}
if
(
!
(
p_acc0_biases_vec
.
size
()
==
p_acc1_biases_vec
.
size
()))
{
throw
std
::
runtime_error
(
"wrong! acc0_bias_vec.size != acc1_bias_vec.size"
);
}
grid_size_
=
0
;
grid_size_
=
0
;
for
(
std
::
size_t
i
=
0
;
i
<
group_count_
;
i
++
)
for
(
std
::
size_t
i
=
0
;
i
<
group_count_
;
i
++
)
...
@@ -620,14 +490,25 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -620,14 +490,25 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
const
auto
p_b1_grid
=
static_cast
<
const
B1DataType
*>
(
p_b1_vec
[
i
]);
const
auto
p_b1_grid
=
static_cast
<
const
B1DataType
*>
(
p_b1_vec
[
i
]);
const
auto
p_c_grid
=
static_cast
<
CDataType
*>
(
p_c_vec
[
i
]);
const
auto
p_c_grid
=
static_cast
<
CDataType
*>
(
p_c_vec
[
i
]);
const
auto
a_grid_desc_ak0_m_ak1
=
DeviceOp
::
MakeAGridDescriptor_AK0_M_AK1
(
const
auto
&
problem_desc
=
problem_desc_vec
[
i
];
problem_desc_vec
[
i
].
M
,
problem_desc_vec
[
i
].
K
,
problem_desc_vec
[
i
].
StrideA
);
const
auto
b_grid_desc_bk0_n_bk1
=
DeviceOp
::
MakeBGridDescriptor_BK0_N_BK1
(
const
auto
a_grid_desc_ak0_m_ak1
=
MakeAGridDescriptor_AK0_M_AK1
(
problem_desc_vec
[
i
].
K
,
problem_desc_vec
[
i
].
N
,
problem_desc_vec
[
i
].
StrideB0
);
problem_desc
.
a_gs_ms_ks_lengths
,
problem_desc
.
a_gs_ms_ks_strides
);
const
auto
b1_grid_desc_bk0_n_bk1
=
DeviceOp
::
MakeB1GridDescriptor_BK0_N_BK1
(
const
auto
b_grid_desc_bk0_n_bk1
=
MakeBGridDescriptor_BK0_N_BK1
(
problem_desc_vec
[
i
].
N
,
problem_desc_vec
[
i
].
O
,
problem_desc_vec
[
i
].
StrideB1
);
problem_desc
.
b0_gs_ns_ks_lengths
,
problem_desc
.
b0_gs_ns_ks_strides
);
const
auto
c_grid_desc_m_n
=
DeviceOp
::
MakeCGridDescriptor_M_N
(
const
auto
b1_grid_desc_bk0_n_bk1
=
MakeB1GridDescriptor_BK0_N_BK1
(
problem_desc_vec
[
i
].
c_gs_ms_os_lengths
,
problem_desc_vec
[
i
].
c_gs_ms_os_strides
);
problem_desc
.
b1_gs_os_ns_lengths
,
problem_desc
.
b1_gs_os_ns_strides
);
const
auto
c_grid_desc_m_n
=
Transform
::
MakeCGridDescriptor_M_N
(
problem_desc
.
c_gs_ms_os_lengths
,
problem_desc
.
c_gs_ms_os_strides
);
const
auto
a_grid_desc_g_m_k
=
Transform
::
MakeAGridDescriptor_G_M_K
(
problem_desc
.
a_gs_ms_ks_lengths
,
problem_desc
.
a_gs_ms_ks_strides
);
const
auto
b_grid_desc_g_n_k
=
Transform
::
MakeB0GridDescriptor_G_N_K
(
problem_desc
.
b0_gs_ns_ks_lengths
,
problem_desc
.
b0_gs_ns_ks_strides
);
const
auto
b1_grid_desc_g_n_k
=
Transform
::
MakeB1GridDescriptor_G_N_K
(
problem_desc
.
b1_gs_os_ns_lengths
,
problem_desc
.
b1_gs_os_ns_strides
);
const
auto
c_grid_desc_g_m_n
=
Transform
::
MakeCGridDescriptor_G_M_N
(
problem_desc
.
c_gs_ms_os_lengths
,
problem_desc
.
c_gs_ms_os_strides
);
const
auto
c_grid_desc_mblock_mperblock_nblock_nperblock
=
const
auto
c_grid_desc_mblock_mperblock_nblock_nperblock
=
GridwiseGemm
::
MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
GridwiseGemm
::
MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
...
@@ -635,25 +516,32 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -635,25 +516,32 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
const
index_t
BlockStart
=
grid_size_
;
const
index_t
BlockStart
=
grid_size_
;
const
auto
block_2_ctile_map
=
Block2CTileMap
(
c_grid_desc_m_n
,
BlockStart
);
const
auto
block_2_ctile_map
=
Block2CTileMap
(
c_grid_desc_m_n
,
BlockStart
);
const
index_t
grid_size_grp
=
block_2_ctile_map
.
CalculateGridSize
(
c_grid_desc_m_n
)
*
const
index_t
batch_count
=
c_grid_desc_g_m_n
.
GetLength
(
I0
);
problem_desc_vec
[
i
].
Batch
;
const
index_t
grid_size_grp
=
block_2_ctile_map
.
CalculateGridSize
(
c_grid_desc_m_n
)
*
batch_count
;
const
index_t
BlockEnd
=
grid_size_
+
grid_size_grp
;
const
index_t
BlockEnd
=
grid_size_
+
grid_size_grp
;
// batch stride
// batch stride
// TODO ANT: only keep batch stride in tensor desc to reduce scalar cache pressure
const
auto
compute_base_ptr_of_batch
=
ComputeBasePtrOfStridedBatch
(
const
auto
c_grid_desc_g_m_n
=
DeviceOp
::
MakeCGridDescriptor_G_M_N
(
a_grid_desc_g_m_k
,
b_grid_desc_g_n_k
,
b1_grid_desc_g_n_k
,
c_grid_desc_g_m_n
);
problem_desc_vec
[
i
].
c_gs_ms_os_lengths
,
problem_desc_vec
[
i
].
c_gs_ms_os_strides
);
const
auto
compute_base_ptr_of_batch
=
ComputeBasePtrOfStridedBatch
(
problem_desc_vec
[
i
].
BatchStrideA
,
problem_desc_vec
[
i
].
BatchStrideB0
,
problem_desc_vec
[
i
].
BatchStrideB1
,
c_grid_desc_g_m_n
);
// C0 mask
// C0 mask
const
auto
c0_matrix_mask
=
C0MatrixMask
(
problem_desc_vec
[
i
].
N
);
const
auto
c0_matrix_mask
=
C0MatrixMask
(
b_grid_desc_g_n_k
.
GetLength
(
I1
)
);
grid_size_
+=
grid_size_grp
;
grid_size_
+=
grid_size_grp
;
// for each group, make sure acc0_biases_gs_ms_ns_lengths.size() == NumAcc0Bias and
// so on
if
(
!
(
problem_desc
.
acc0_biases_gs_ms_ns_lengths
.
size
()
==
NumAcc0Bias
&&
problem_desc
.
acc0_biases_gs_ms_ns_strides
.
size
()
==
NumAcc0Bias
&&
problem_desc
.
acc1_biases_gs_ms_os_lengths
.
size
()
==
NumAcc1Bias
&&
problem_desc
.
acc1_biases_gs_ms_os_strides
.
size
()
==
NumAcc1Bias
))
{
throw
std
::
runtime_error
(
"wrong! number of biases in function argument does not "
"match that in template argument"
);
}
group_kernel_args_
.
push_back
({
p_a_grid
,
group_kernel_args_
.
push_back
({
p_a_grid
,
p_b_grid
,
p_b_grid
,
p_b1_grid
,
p_b1_grid
,
...
@@ -669,13 +557,20 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -669,13 +557,20 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
BlockStart
,
BlockStart
,
BlockEnd
});
BlockEnd
});
group_device_args_
.
push_back
({
problem_desc_vec
[
i
].
M
,
group_device_args_
.
push_back
(
problem_desc_vec
[
i
].
N
,
{{
problem_desc
.
a_gs_ms_ks_lengths
[
NumDimG
+
NumDimM
-
1
],
problem_desc_vec
[
i
].
K
,
problem_desc
.
b0_gs_ns_ks_lengths
[
NumDimG
+
NumDimN
-
1
],
problem_desc_vec
[
i
].
O
,
problem_desc
.
b0_gs_ns_ks_lengths
[
NumDimG
+
NumDimN
+
NumDimK
-
1
],
problem_desc_vec
[
i
].
c_gs_ms_os_lengths
.
back
(),
problem_desc
.
b1_gs_os_ns_lengths
[
NumDimG
+
NumDimO
-
1
]},
problem_desc_vec
[
i
].
c_gs_ms_os_strides
.
back
(),
{
problem_desc
.
a_gs_ms_ks_strides
[
NumDimG
+
NumDimM
-
1
],
c_grid_desc_m_n
});
problem_desc
.
a_gs_ms_ks_strides
[
NumDimG
+
NumDimM
+
NumDimK
-
1
]},
{
problem_desc
.
b0_gs_ns_ks_strides
[
NumDimG
+
NumDimN
-
1
],
problem_desc
.
b0_gs_ns_ks_strides
[
NumDimG
+
NumDimN
+
NumDimK
-
1
]},
{
problem_desc
.
b1_gs_os_ns_strides
[
NumDimG
+
NumDimO
-
1
],
problem_desc
.
b1_gs_os_ns_strides
[
NumDimG
+
NumDimO
+
NumDimN
-
1
]},
{
problem_desc
.
c_gs_ms_os_strides
[
NumDimG
+
NumDimM
-
1
],
problem_desc
.
c_gs_ms_os_strides
[
NumDimG
+
NumDimM
+
NumDimO
-
1
]},
c_grid_desc_m_n
});
}
}
}
}
...
@@ -788,6 +683,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -788,6 +683,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
return
false
;
return
false
;
}
}
// TODO ANT: Check if tensor specialization & strides mismatch
bool
all_has_main_k_block_loop
=
true
;
bool
all_has_main_k_block_loop
=
true
;
bool
some_has_main_k_block_loop
=
false
;
bool
some_has_main_k_block_loop
=
false
;
...
@@ -815,19 +712,16 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -815,19 +712,16 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
// Note: we need raw lengths since threadwise copy can not handle vector load when
// Note: we need raw lengths since threadwise copy can not handle vector load when
// part of vector is out of bounds
// part of vector is out of bounds
const
auto
MRaw
=
device_arg
.
M
;
const
auto
M
z
Raw
=
device_arg
.
raw_lengths_mz_nz_kz_gemm1nz_
[
0
]
;
const
auto
NRaw
=
device_arg
.
N
;
const
auto
N
z
Raw
=
device_arg
.
raw_lengths_mz_nz_kz_gemm1nz_
[
1
]
;
const
auto
KRaw
=
device_arg
.
K
;
const
auto
K
z
Raw
=
device_arg
.
raw_lengths_mz_nz_kz_gemm1nz_
[
2
]
;
const
auto
Gemm1NRaw
=
device_arg
.
O
;
const
auto
Gemm1N
z
Raw
=
device_arg
.
raw_lengths_mz_nz_kz_gemm1nz_
[
3
]
;
// Check scalar per vector requirement
// Check scalar per vector requirement
const
auto
a_extent_lowest
=
const
auto
a_extent_lowest
=
ABlockTransferSrcVectorDim
==
2
?
KzRaw
:
MzRaw
;
is_same_v
<
tensor_layout
::
gemm
::
RowMajor
,
ALayout
>
?
KRaw
:
MRaw
;
const
auto
b_extent_lowest
=
BBlockTransferSrcVectorDim
==
2
?
KzRaw
:
NzRaw
;
const
auto
b_extent_lowest
=
const
auto
b1_extent_lowest
=
B1BlockTransferSrcVectorDim
==
2
?
NzRaw
:
Gemm1NzRaw
;
is_same_v
<
tensor_layout
::
gemm
::
RowMajor
,
BLayout
>
?
NRaw
:
KRaw
;
const
auto
c_extent_lowest
=
Gemm1NzRaw
;
const
auto
b1_extent_lowest
=
is_same_v
<
tensor_layout
::
gemm
::
RowMajor
,
B1Layout
>
?
Gemm1NRaw
:
NRaw
;
const
auto
c_extent_lowest
=
device_arg
.
c_extent_lowest_
;
if
(
!
(
a_extent_lowest
%
ABlockTransferSrcScalarPerVector
==
0
&&
if
(
!
(
a_extent_lowest
%
ABlockTransferSrcScalarPerVector
==
0
&&
b_extent_lowest
%
BBlockTransferSrcScalarPerVector
==
0
&&
b_extent_lowest
%
BBlockTransferSrcScalarPerVector
==
0
&&
...
@@ -837,8 +731,22 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -837,8 +731,22 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
return
false
;
return
false
;
}
}
// Check vector store requirement; assumes last dimension in N to be contiguous
// Check vector load/store requirement
if
(
device_arg
.
c_stride_lowest_
!=
1
)
const
auto
a_stride_lowest
=
ABlockTransferSrcVectorDim
==
2
?
device_arg
.
a_mz_kz_strides_
[
1
]
:
device_arg
.
a_mz_kz_strides_
[
0
];
const
auto
b_stride_lowest
=
BBlockTransferSrcVectorDim
==
2
?
device_arg
.
b_nz_kz_strides_
[
1
]
:
device_arg
.
b_nz_kz_strides_
[
0
];
const
auto
b1_stride_lowest
=
B1BlockTransferSrcVectorDim
==
2
?
device_arg
.
b1_nz_kz_strides_
[
1
]
:
device_arg
.
b1_nz_kz_strides_
[
0
];
const
auto
c_stride_lowest
=
device_arg
.
c_mz_gemm1nz_strides_
[
1
];
// cshuffle assumes lowest dim in Gemm1Ns to be
// contiguous
if
(
!
(
a_stride_lowest
==
1
||
b_stride_lowest
==
1
||
b1_stride_lowest
==
1
||
c_stride_lowest
==
1
))
{
{
return
false
;
return
false
;
}
}
...
@@ -873,6 +781,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -873,6 +781,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
std
::
vector
<
const
void
*>
p_b_vec
,
std
::
vector
<
const
void
*>
p_b_vec
,
std
::
vector
<
const
void
*>
p_b1_vec
,
std
::
vector
<
const
void
*>
p_b1_vec
,
std
::
vector
<
void
*>
p_c_vec
,
std
::
vector
<
void
*>
p_c_vec
,
std
::
vector
<
std
::
vector
<
const
void
*>>
p_acc0_biases_vec
,
std
::
vector
<
std
::
vector
<
const
void
*>>
p_acc1_biases_vec
,
std
::
vector
<
ProblemDesc
>
problem_desc_vec
,
std
::
vector
<
ProblemDesc
>
problem_desc_vec
,
AElementwiseOperation
a_element_op
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
BElementwiseOperation
b_element_op
,
...
@@ -884,6 +794,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -884,6 +794,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
p_b_vec
,
p_b_vec
,
p_b1_vec
,
p_b1_vec
,
p_c_vec
,
p_c_vec
,
p_acc0_biases_vec
,
p_acc1_biases_vec
,
problem_desc_vec
,
problem_desc_vec
,
a_element_op
,
a_element_op
,
b_element_op
,
b_element_op
,
...
@@ -895,21 +807,26 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -895,21 +807,26 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
// polymorphic
// polymorphic
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
std
::
vector
<
const
void
*>
p_a_vec
,
std
::
unique_ptr
<
BaseArgument
>
std
::
vector
<
const
void
*>
p_b_vec
,
MakeArgumentPointer
(
std
::
vector
<
const
void
*>
p_a_vec
,
std
::
vector
<
const
void
*>
p_b1_vec
,
std
::
vector
<
const
void
*>
p_b_vec
,
std
::
vector
<
void
*>
p_c_vec
,
std
::
vector
<
const
void
*>
p_b1_vec
,
std
::
vector
<
ProblemDesc
>
problem_desc_vec
,
std
::
vector
<
void
*>
p_c_vec
,
AElementwiseOperation
a_element_op
,
std
::
vector
<
std
::
vector
<
const
void
*>>
p_acc0_biases_vec
,
BElementwiseOperation
b_element_op
,
std
::
vector
<
std
::
vector
<
const
void
*>>
p_acc1_biases_vec
,
AccElementwiseOperation
acc_element_op
,
std
::
vector
<
ProblemDesc
>
problem_desc_vec
,
B1ElementwiseOperation
b1_element_op
,
AElementwiseOperation
a_element_op
,
CElementwiseOperation
c_element_op
)
override
BElementwiseOperation
b_element_op
,
AccElementwiseOperation
acc_element_op
,
B1ElementwiseOperation
b1_element_op
,
CElementwiseOperation
c_element_op
)
override
{
{
return
std
::
make_unique
<
Argument
>
(
p_a_vec
,
return
std
::
make_unique
<
Argument
>
(
p_a_vec
,
p_b_vec
,
p_b_vec
,
p_b1_vec
,
p_b1_vec
,
p_c_vec
,
p_c_vec
,
p_acc0_biases_vec
,
p_acc1_biases_vec
,
problem_desc_vec
,
problem_desc_vec
,
a_element_op
,
a_element_op
,
b_element_op
,
b_element_op
,
...
@@ -942,7 +859,12 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
...
@@ -942,7 +859,12 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
<<
Gemm1NPerBlock
<<
", "
<<
Gemm1NPerBlock
<<
", "
<<
Gemm1KPerBlock
<<
", "
<<
Gemm1KPerBlock
<<
", "
<<
B1K1
<<
", "
<<
B1K1
<<
", "
<<
getGemmSpecializationString
(
GemmSpec
)
<<
">"
;
<<
getGemmSpecializationString
(
GemmSpec
)
<<
", "
<<
"ASpec"
<<
getTensorSpecializationString
(
ASpec
)
<<
", "
<<
"B0Spec"
<<
getTensorSpecializationString
(
BSpec
)
<<
", "
<<
"B1Spec"
<<
getTensorSpecializationString
(
B1Spec
)
<<
", "
<<
"CSpec"
<<
getTensorSpecializationString
(
CSpec
)
<<
", "
<<
getMaskingSpecializationString
(
MaskingSpec
)
<<
">"
;
// clang-format on
// clang-format on
return
str
.
str
();
return
str
.
str
();
...
...
include/ck/tensor_operation/gpu/device/device_normalization.hpp
View file @
24af0144
...
@@ -11,33 +11,6 @@
...
@@ -11,33 +11,6 @@
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
struct
DeviceNormalization
:
public
BaseOperator
{
// inLengths: input tensor extent(s) from high to low dimension
// inStrides: input tensor stride(s) from high to low dimension
// reduceDims: the dimension(s) the normalization operation is applied
// alpha: typeless pointer in host memory storing the alpha scaling value of type AccDataType
// beta: typeless pointer in host memory storing the beta scaling value of type AccDataType
// in_dev: typeless const pointer in device memory storing the input tensor
// out_dev: typeless pointer in device memory storing the output tensor
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
std
::
vector
<
index_t
>
inLengths
,
const
std
::
vector
<
index_t
>
inStrides
,
const
std
::
vector
<
int
>
reduceDims
,
const
void
*
alpha
,
const
void
*
beta
,
const
void
*
in_dev
,
void
*
out_dev
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
virtual
index_t
GetRank
()
const
=
0
;
virtual
index_t
GetNumReduceDim
()
const
=
0
;
};
using
DeviceNormalizationPtr
=
std
::
unique_ptr
<
DeviceNormalization
>
;
template
<
typename
XDataType
,
template
<
typename
XDataType
,
typename
GammaDataType
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
BetaDataType
,
...
@@ -46,7 +19,7 @@ template <typename XDataType,
...
@@ -46,7 +19,7 @@ template <typename XDataType,
typename
AccElementwiseOperation
,
typename
AccElementwiseOperation
,
index_t
Rank
,
index_t
Rank
,
index_t
NumReduceDim
>
index_t
NumReduceDim
>
struct
Device
Layernorm
:
public
BaseOperator
struct
Device
Normalization
:
public
BaseOperator
{
{
virtual
std
::
unique_ptr
<
BaseArgument
>
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
std
::
vector
<
index_t
>
lengths
,
MakeArgumentPointer
(
const
std
::
vector
<
index_t
>
lengths
,
...
@@ -60,6 +33,8 @@ struct DeviceLayernorm : public BaseOperator
...
@@ -60,6 +33,8 @@ struct DeviceLayernorm : public BaseOperator
const
void
*
p_gamma
,
const
void
*
p_gamma
,
const
void
*
p_beta
,
const
void
*
p_beta
,
void
*
p_y
,
void
*
p_y
,
void
*
p_savedMean
,
void
*
p_savedInvVar
,
AccElementwiseOperation
acc_elementwise_op
)
=
0
;
AccElementwiseOperation
acc_elementwise_op
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
...
@@ -73,14 +48,14 @@ template <typename XDataType,
...
@@ -73,14 +48,14 @@ template <typename XDataType,
typename
AccElementwiseOperation
,
typename
AccElementwiseOperation
,
index_t
Rank
,
index_t
Rank
,
index_t
NumReduceDim
>
index_t
NumReduceDim
>
using
Device
Layernorm
Ptr
=
std
::
unique_ptr
<
Device
Layernorm
<
XDataType
,
using
Device
Normalization
Ptr
=
std
::
unique_ptr
<
Device
Normalization
<
XDataType
,
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
AccDataType
,
AccDataType
,
YDataType
,
YDataType
,
AccElementwiseOperation
,
AccElementwiseOperation
,
Rank
,
Rank
,
NumReduceDim
>>
;
NumReduceDim
>>
;
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
...
...
Prev
1
…
6
7
8
9
10
11
12
13
14
…
41
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment