Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
ac580f77
Commit
ac580f77
authored
Jun 16, 2023
by
Alan Turner
Browse files
Merge remote-tracking branch 'origin/develop' into migx-jit-lib
parents
707d6261
027e46ee
Changes
146
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1830 additions
and
21 deletions
+1830
-21
include/ck/utility/dynamic_buffer.hpp
include/ck/utility/dynamic_buffer.hpp
+16
-7
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp
...ry/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp
+337
-0
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
...or_operation_instance/gpu/grouped_convolution_forward.hpp
+14
-14
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt
...peration_instance/gpu/batched_gemm_multi_d/CMakeLists.txt
+18
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
...ched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
+95
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp
...multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp
+84
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
...ched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
+95
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp
...multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp
+83
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
...ched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
+95
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp
...multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp
+83
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
...ched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
+95
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp
...multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp
+83
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp
...batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp
+93
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp
...mm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp
+90
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp
...batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp
+93
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp
...mm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp
+90
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp
...batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp
+93
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp
...mm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp
+90
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp
...batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp
+93
-0
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp
...mm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp
+90
-0
No files found.
include/ck/utility/dynamic_buffer.hpp
View file @
ac580f77
...
@@ -19,7 +19,8 @@ namespace ck {
...
@@ -19,7 +19,8 @@ namespace ck {
template
<
AddressSpaceEnum
BufferAddressSpace
,
template
<
AddressSpaceEnum
BufferAddressSpace
,
typename
T
,
typename
T
,
typename
ElementSpaceSize
,
typename
ElementSpaceSize
,
bool
InvalidElementUseNumericalZeroValue
>
bool
InvalidElementUseNumericalZeroValue
,
AmdBufferCoherenceEnum
coherence
=
AmdBufferCoherenceEnum
::
DefaultCoherence
>
struct
DynamicBuffer
struct
DynamicBuffer
{
{
using
type
=
T
;
using
type
=
T
;
...
@@ -77,13 +78,16 @@ struct DynamicBuffer
...
@@ -77,13 +78,16 @@ struct DynamicBuffer
if
constexpr
(
InvalidElementUseNumericalZeroValue
)
if
constexpr
(
InvalidElementUseNumericalZeroValue
)
{
{
return
amd_buffer_load_invalid_element_return_zero
<
remove_cvref_t
<
T
>
,
t_per_x
>
(
return
amd_buffer_load_invalid_element_return_zero
<
remove_cvref_t
<
T
>
,
t_per_x
,
coherence
>
(
p_data_
,
i
,
is_valid_element
,
element_space_size_
);
p_data_
,
i
,
is_valid_element
,
element_space_size_
);
}
}
else
else
{
{
return
amd_buffer_load_invalid_element_return_customized_value
<
remove_cvref_t
<
T
>
,
return
amd_buffer_load_invalid_element_return_customized_value
<
remove_cvref_t
<
T
>
,
t_per_x
>
(
t_per_x
,
coherence
>
(
p_data_
,
i
,
is_valid_element
,
element_space_size_
,
invalid_element_value_
);
p_data_
,
i
,
is_valid_element
,
element_space_size_
,
invalid_element_value_
);
}
}
}
}
...
@@ -173,7 +177,7 @@ struct DynamicBuffer
...
@@ -173,7 +177,7 @@ struct DynamicBuffer
{
{
constexpr
index_t
t_per_x
=
scalar_per_x_vector
/
scalar_per_t_vector
;
constexpr
index_t
t_per_x
=
scalar_per_x_vector
/
scalar_per_t_vector
;
amd_buffer_store
<
remove_cvref_t
<
T
>
,
t_per_x
>
(
amd_buffer_store
<
remove_cvref_t
<
T
>
,
t_per_x
,
coherence
>
(
x
,
p_data_
,
i
,
is_valid_element
,
element_space_size_
);
x
,
p_data_
,
i
,
is_valid_element
,
element_space_size_
);
}
}
else
if
constexpr
(
GetAddressSpace
()
==
AddressSpaceEnum
::
Lds
&&
else
if
constexpr
(
GetAddressSpace
()
==
AddressSpaceEnum
::
Lds
&&
...
@@ -376,14 +380,19 @@ struct DynamicBuffer
...
@@ -376,14 +380,19 @@ struct DynamicBuffer
__host__
__device__
static
constexpr
bool
IsDynamicBuffer
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsDynamicBuffer
()
{
return
true
;
}
};
};
template
<
AddressSpaceEnum
BufferAddressSpace
,
typename
T
,
typename
ElementSpaceSize
>
template
<
AddressSpaceEnum
BufferAddressSpace
,
AmdBufferCoherenceEnum
coherence
=
AmdBufferCoherenceEnum
::
DefaultCoherence
,
typename
T
,
typename
ElementSpaceSize
>
__host__
__device__
constexpr
auto
make_dynamic_buffer
(
T
*
p
,
ElementSpaceSize
element_space_size
)
__host__
__device__
constexpr
auto
make_dynamic_buffer
(
T
*
p
,
ElementSpaceSize
element_space_size
)
{
{
return
DynamicBuffer
<
BufferAddressSpace
,
T
,
ElementSpaceSize
,
true
>
{
p
,
element_space_size
};
return
DynamicBuffer
<
BufferAddressSpace
,
T
,
ElementSpaceSize
,
true
,
coherence
>
{
p
,
element_space_size
};
}
}
template
<
template
<
AddressSpaceEnum
BufferAddressSpace
,
AddressSpaceEnum
BufferAddressSpace
,
AmdBufferCoherenceEnum
coherence
=
AmdBufferCoherenceEnum
::
DefaultCoherence
,
typename
T
,
typename
T
,
typename
ElementSpaceSize
,
typename
ElementSpaceSize
,
typename
X
,
typename
X
,
...
@@ -391,7 +400,7 @@ template <
...
@@ -391,7 +400,7 @@ template <
__host__
__device__
constexpr
auto
__host__
__device__
constexpr
auto
make_dynamic_buffer
(
T
*
p
,
ElementSpaceSize
element_space_size
,
X
invalid_element_value
)
make_dynamic_buffer
(
T
*
p
,
ElementSpaceSize
element_space_size
,
X
invalid_element_value
)
{
{
return
DynamicBuffer
<
BufferAddressSpace
,
T
,
ElementSpaceSize
,
false
>
{
return
DynamicBuffer
<
BufferAddressSpace
,
T
,
ElementSpaceSize
,
false
,
coherence
>
{
p
,
element_space_size
,
invalid_element_value
};
p
,
element_space_size
,
invalid_element_value
};
}
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
template
<
typename
ALayout
,
typename
BLayout
,
typename
ELayout
,
typename
ADataType
,
typename
BDataType
,
typename
EDataType
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceBatchedGemmMultiD
<
ALayout
,
BLayout
,
Empty_Tuple
,
ELayout
,
ADataType
,
BDataType
,
Empty_Tuple
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>>
{
using
DeviceOp
=
DeviceBatchedGemmMultiD
<
ALayout
,
BLayout
,
Empty_Tuple
,
ELayout
,
ADataType
,
BDataType
,
Empty_Tuple
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
half_t
>
&&
is_same_v
<
EDataType
,
half_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instances
(
op_ptrs
);
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instances
(
op_ptrs
);
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instances
(
op_ptrs
);
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instances
(
op_ptrs
);
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
is_same_v
<
ADataType
,
int8_t
>
&&
is_same_v
<
BDataType
,
int8_t
>
&&
is_same_v
<
EDataType
,
int8_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances
(
op_ptrs
);
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances
(
op_ptrs
);
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances
(
op_ptrs
);
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instances
(
op_ptrs
);
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances
(
op_ptrs
);
}
}
return
op_ptrs
;
}
};
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
View file @
ac580f77
...
@@ -245,11 +245,11 @@ void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances(
...
@@ -245,11 +245,11 @@ void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
PassThrough
>>>&
instances
);
// grouped conv3d forward, NDHWGC/KZYX
G
C/NDHWGK
// grouped conv3d forward, NDHWGC/
G
KZYXC/NDHWGK
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyx
g
c_ndhwgk_bf16_instances
(
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_
g
kzyxc_ndhwgk_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
NDHWGC
,
KZYX
G
C
,
G
KZYXC
,
Empty_Tuple
,
Empty_Tuple
,
NDHWGK
,
NDHWGK
,
BF16
,
BF16
,
...
@@ -260,10 +260,10 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances(
...
@@ -260,10 +260,10 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_bf16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyx
g
c_ndhwgk_f16_instances
(
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_
g
kzyxc_ndhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
NDHWGC
,
KZYX
G
C
,
G
KZYXC
,
Empty_Tuple
,
Empty_Tuple
,
NDHWGK
,
NDHWGK
,
F16
,
F16
,
...
@@ -274,10 +274,10 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instances(
...
@@ -274,10 +274,10 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyx
g
c_ndhwgk_f32_instances
(
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_
g
kzyxc_ndhwgk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
NDHWGC
,
KZYX
G
C
,
G
KZYXC
,
Empty_Tuple
,
Empty_Tuple
,
NDHWGK
,
NDHWGK
,
F32
,
F32
,
...
@@ -288,10 +288,10 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instances(
...
@@ -288,10 +288,10 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyx
g
c_ndhwgk_int8_instances
(
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_
g
kzyxc_ndhwgk_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
NDHWGC
,
KZYX
G
C
,
G
KZYXC
,
Empty_Tuple
,
Empty_Tuple
,
NDHWGK
,
NDHWGK
,
int8_t
,
int8_t
,
...
@@ -433,28 +433,28 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
...
@@ -433,28 +433,28 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
}
}
}
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
InLayout
,
NDHWGC
>
&&
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
InLayout
,
NDHWGC
>
&&
is_same_v
<
WeiLayout
,
KZYX
G
C
>
&&
is_same_v
<
OutLayout
,
NDHWGK
>
)
is_same_v
<
WeiLayout
,
G
KZYXC
>
&&
is_same_v
<
OutLayout
,
NDHWGK
>
)
{
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
)
{
{
add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyx
g
c_ndhwgk_f32_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_xdl_ndhwgc_
g
kzyxc_ndhwgk_f32_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
is_same_v
<
OutDataType
,
half_t
>
)
{
{
add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyx
g
c_ndhwgk_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_xdl_ndhwgc_
g
kzyxc_ndhwgk_f16_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
{
add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyx
g
c_ndhwgk_bf16_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_xdl_ndhwgc_
g
kzyxc_ndhwgk_bf16_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
)
{
{
add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyx
g
c_ndhwgk_int8_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_xdl_ndhwgc_
g
kzyxc_ndhwgk_int8_instances
(
op_ptrs
);
}
}
}
}
...
...
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt
0 → 100644
View file @
ac580f77
add_instance_library
(
device_batched_gemm_multi_d_instance
device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp
device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp
device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp
device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp
device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp
device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp
device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp
device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp
)
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
using
device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
8
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=128, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
64
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
64
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
64
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
2
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
64
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
8
,
1
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=16, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
4
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
4
,
1
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
2
,
1
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNPadding
;
// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
using
device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
8
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
4
,
1
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
using
device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
8
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=128, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
64
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
64
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
64
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
2
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
64
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
8
,
1
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=16, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
4
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
4
,
1
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
2
,
1
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNPadding
;
// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
using
device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
8
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
4
,
1
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
using
device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
8
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=128, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
64
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
4
,
2
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
64
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
4
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
64
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
2
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
64
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
8
,
2
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
8
,
1
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=16, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
4
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
4
,
1
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
2
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNPadding
;
// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
using
device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
8
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
4
,
1
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
using
device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
8
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// // MPerBlock=128, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
64
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
4
,
2
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
64
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
4
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// // MPerBlock=64, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
64
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
2
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
64
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
2
,
2
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
8
,
1
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=16, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
4
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
4
,
1
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
2
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNPadding
;
// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
using
device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
8
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
2
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
2
,
2
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
4
,
1
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Col
,
Empty_Tuple
,
Row
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
using
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
4
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
8
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
2
,
8
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=128, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
64
,
16
,
4
,
4
,
2
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=64, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
64
,
128
,
16
,
4
,
2
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
8
,
1
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=32, NPerBlock=32
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
32
,
32
,
32
,
8
,
4
,
4
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=16, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
16
,
64
,
16
,
4
,
1
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
16
,
64
,
16
,
4
,
1
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
16
,
16
,
4
,
4
,
1
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
16
,
16
,
4
,
4
,
1
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
16
,
16
,
16
,
16
,
4
,
2
,
2
,
1
,
S
<
4
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
4
,
4
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
4
,
4
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
4
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
4
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
4
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
4
,
1
>
,
S
<
8
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
4
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
4
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
2
,
1
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
4
,
1
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
4
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
4
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNPadding
;
// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
using
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
8
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
4
,
4
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
2
,
8
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=128, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
64
,
16
,
4
,
4
,
2
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
64
,
16
,
4
,
4
,
2
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
64
,
128
,
16
,
4
,
2
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
64
,
128
,
16
,
4
,
2
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
8
,
1
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=32, NPerBlock=32
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
32
,
32
,
32
,
8
,
4
,
2
,
4
,
1
,
S
<
4
,
2
>
,
S
<
2
,
2
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
32
,
32
,
32
,
8
,
4
,
4
,
2
,
1
,
S
<
2
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
32
,
32
,
32
,
8
,
4
,
4
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
16
,
16
,
16
,
16
,
4
,
2
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
2
>
,
S
<
4
,
1
,
4
,
4
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
4
,
4
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
16
,
16
,
16
,
16
,
4
,
2
,
2
,
1
,
S
<
4
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
4
,
4
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
4
,
4
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
8
,
64
,
32
,
4
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
8
,
32
,
4
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
4
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
2
,
1
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
4
,
1
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
4
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
4
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
using
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
4
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
8
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
2
,
8
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=128, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
64
,
16
,
4
,
4
,
2
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=64, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
64
,
128
,
16
,
4
,
2
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
8
,
1
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=32, NPerBlock=32
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
32
,
32
,
32
,
8
,
4
,
4
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=16, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
4
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
4
,
1
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
2
,
1
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNPadding
;
// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
using
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
8
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
4
,
4
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
2
,
8
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=128, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
64
,
16
,
4
,
4
,
2
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
64
,
16
,
4
,
4
,
2
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
64
,
128
,
16
,
4
,
2
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
64
,
128
,
16
,
4
,
2
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
8
,
1
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=32, NPerBlock=32
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
32
,
32
,
32
,
8
,
4
,
2
,
4
,
1
,
S
<
4
,
2
>
,
S
<
2
,
2
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
32
,
32
,
32
,
8
,
4
,
4
,
2
,
1
,
S
<
2
,
2
>
,
S
<
4
,
2
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
32
,
32
,
32
,
8
,
4
,
4
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
4
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
2
,
1
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Col
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
using
device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
4
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
8
,
2
>
,
S
<
4
,
2
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
2
,
8
>
,
S
<
2
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=128, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
64
,
16
,
4
,
4
,
2
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=64, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
64
,
128
,
16
,
4
,
2
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
8
,
1
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=32, NPerBlock=32
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
32
,
32
,
32
,
8
,
4
,
4
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=16, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
16
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
4
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
4
,
1
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
2
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNPadding
;
// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
using
device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
8
,
2
>
,
S
<
4
,
2
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
4
,
4
>
,
S
<
4
,
2
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
2
,
8
>
,
S
<
2
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
8
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=128, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
64
,
16
,
4
,
4
,
2
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
64
,
16
,
4
,
4
,
2
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
64
,
128
,
16
,
4
,
2
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
64
,
128
,
16
,
4
,
2
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
8
,
1
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=32, NPerBlock=32
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
32
,
32
,
32
,
8
,
4
,
2
,
4
,
1
,
S
<
4
,
2
>
,
S
<
2
,
2
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
32
,
32
,
32
,
8
,
4
,
4
,
2
,
1
,
S
<
2
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
32
,
32
,
32
,
8
,
4
,
4
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
2
,
1
,
4
,
4
>
,
S
<
4
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
4
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
32
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
2
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
4
,
1
,
2
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Row
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
using
device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
128
,
16
,
4
,
4
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
8
,
2
>
,
S
<
4
,
2
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
2
,
8
>
,
S
<
2
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// // MPerBlock=128, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
128
,
64
,
16
,
4
,
4
,
2
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// // MPerBlock=64, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
256
,
64
,
128
,
16
,
4
,
2
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
8
,
1
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=32, NPerBlock=32
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
32
,
32
,
32
,
8
,
4
,
4
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=16, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
16
,
64
,
16
,
2
,
1
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
,
// MPerBlock=64, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
16
,
16
,
2
,
4
,
1
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
4
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
4
,
1
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
2
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
2
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmDefault
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp
0 → 100644
View file @
ac580f77
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNPadding
;
// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
using
device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances
=
std
::
tuple
<
// clang-format off
// ##########################| ALayout| BLayout| DsLayout| CLayout| AData| BData| AccData| DsData| CData| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ##########################| | | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ##########################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | |
// ##########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// MPerBlock=128, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
8
,
2
>
,
S
<
4
,
2
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
4
,
4
>
,
S
<
4
,
2
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
128
,
128
,
128
,
16
,
4
,
4
,
8
,
1
,
S
<
2
,
8
>
,
S
<
2
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
8
,
1
,
2
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// // MPerBlock=128, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
64
,
16
,
4
,
4
,
2
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
128
,
64
,
16
,
4
,
4
,
2
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// // MPerBlock=64, NPerBlock=128
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
64
,
128
,
16
,
4
,
2
,
4
,
1
,
S
<
4
,
4
>
,
S
<
4
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
256
,
64
,
128
,
16
,
4
,
2
,
4
,
1
,
S
<
2
,
8
>
,
S
<
2
,
8
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
64
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
8
,
1
,
1
,
4
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
2
,
4
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
8
,
1
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
64
,
8
,
4
,
4
,
4
,
1
,
S
<
4
,
2
>
,
S
<
8
,
1
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
32
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=32, NPerBlock=32
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
32
,
32
,
32
,
8
,
4
,
2
,
4
,
1
,
S
<
4
,
2
>
,
S
<
2
,
2
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
32
,
32
,
32
,
8
,
4
,
4
,
2
,
1
,
S
<
2
,
2
>
,
S
<
4
,
2
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
32
,
32
,
32
,
8
,
4
,
4
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
4
,
1
,
2
,
4
>
,
S
<
2
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
4
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
4
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=16, NPerBlock=16
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
2
,
2
>
,
S
<
2
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
16
,
16
,
16
,
16
,
2
,
2
,
2
,
1
,
S
<
4
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
4
,
2
>
,
S
<
4
,
1
,
4
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=64
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
8
,
64
,
32
,
2
,
1
,
2
,
1
,
S
<
2
,
2
>
,
S
<
8
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=64, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
64
,
64
,
8
,
32
,
2
,
2
,
1
,
1
,
S
<
8
,
2
>
,
S
<
2
,
2
>
,
S
<
8
,
1
,
4
,
2
>
,
S
<
4
,
1
,
16
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
8
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
// MPerBlock=8, NPerBlock=8
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
4
,
1
>
,
S
<
2
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
1
,
2
,
1
,
S
<
1
,
4
>
,
S
<
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
2
,
1
>
,
S
<
4
,
1
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
,
DeviceBatchedGemmMultipleD_Dl
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
int32_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmMNPadding
,
8
,
8
,
8
,
4
,
2
,
2
,
1
,
1
,
S
<
1
,
2
>
,
S
<
1
,
4
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
1
,
8
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
1
>
// clang-format on
>
;
void
add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmMultiD
<
Row
,
Col
,
Empty_Tuple
,
Row
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
Prev
1
2
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment