gaoqiong / composable_kernel / commits / 0403be0e
"test/vscode:/vscode.git/clone" did not exist on "c77c1e05badb5f5bf774872c3498b21eeb0aef20"
Unverified commit 0403be0e, authored Nov 28, 2023 by M.Emin Ozturk, committed via GitHub on Nov 28, 2023.
Merge branch 'develop' into muozturk/complex_contraction_bilinear_test
Parents: 0d646eff, ae5e5181
Changes: showing 20 of 40 changed files, with 388 additions and 216 deletions (+388, -216).
  client_example/18_groupnorm/CMakeLists.txt                                  +1    -1
  client_example/19_pool/CMakeLists.txt                                       +4    -4
  client_example/20_splitk_gemm/CMakeLists.txt                                +1    -1
  client_example/21_grouped_gemm_bias/CMakeLists.txt                          +1    -1
  client_example/22_grouped_gemm/CMakeLists.txt                               +3    -3
  client_example/22_im2col_col2im/CMakeLists.txt                              +2    -2
  client_example/23_elementwise_transpose/CMakeLists.txt                      +1    -1
  client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/CMakeLists.txt  +4    -4
  client_example/24_grouped_convnd_fwd_scaleadd_ab/CMakeLists.txt             +4    -4
  client_example/CMakeLists.txt                                               +1    -1
  example/27_layernorm2d_fwd/run_layernorm_example.inc                        +2    -2
  example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc                      +2    -2
  example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc                  +2    -2
  include/ck/ck.hpp                                                           +3    -0
  include/ck/stream_config.hpp                                                +2    -2
  include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp    +18   -0
  include/ck/utility/type_convert.hpp                                         +189  -141
  library/src/tensor_operation_instance/gpu/CMakeLists.txt                    +147  -42
  profiler/include/profiler/profile_gemm_impl.hpp                             +1    -1
  test/transpose/test_transpose_ut_cases.inc                                  +0    -2
client_example/18_groupnorm/CMakeLists.txt

  add_executable(client_groupnorm_swish groupnorm_swish.cpp)
- target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_other_operations)
client_example/19_pool/CMakeLists.txt

  add_executable(client_max_pool2d_fwd max_pool2d_fwd.cpp)
- target_link_libraries(client_max_pool2d_fwd PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_max_pool2d_fwd PRIVATE composable_kernel::device_other_operations)

  add_executable(client_max_pool2d_bwd max_pool2d_bwd.cpp)
- target_link_libraries(client_max_pool2d_bwd PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_max_pool2d_bwd PRIVATE composable_kernel::device_other_operations)

  add_executable(client_avg_pool3d_fwd avg_pool3d_fwd.cpp)
- target_link_libraries(client_avg_pool3d_fwd PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_avg_pool3d_fwd PRIVATE composable_kernel::device_other_operations)

  add_executable(client_avg_pool3d_bwd avg_pool3d_bwd.cpp)
- target_link_libraries(client_avg_pool3d_bwd PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_avg_pool3d_bwd PRIVATE composable_kernel::device_other_operations)
client_example/20_splitk_gemm/CMakeLists.txt

  if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
      add_executable(client_splitK_gemm splitK_gemm_fp16_f8.cpp)
-     target_link_libraries(client_splitK_gemm PRIVATE composable_kernel::device_operations)
+     target_link_libraries(client_splitK_gemm PRIVATE composable_kernel::device_gemm_operations)
  endif()
client_example/21_grouped_gemm_bias/CMakeLists.txt

  add_executable(client_grouped_gemm_fixed_nk_bias_fp16 grouped_gemm_fixed_nk_bias_fp16.cpp)
- target_link_libraries(client_grouped_gemm_fixed_nk_bias_fp16 PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_grouped_gemm_fixed_nk_bias_fp16 PRIVATE composable_kernel::device_gemm_operations)
client_example/22_grouped_gemm/CMakeLists.txt

  add_executable(client_grouped_gemm_fixed_nk_fp16 grouped_gemm_fixed_nk_fp16.cpp)
- target_link_libraries(client_grouped_gemm_fixed_nk_fp16 PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_grouped_gemm_fixed_nk_fp16 PRIVATE composable_kernel::device_gemm_operations)

  add_executable(client_grouped_gemm_fixed_nk_fp8 grouped_gemm_fixed_nk_fp8.cpp)
- target_link_libraries(client_grouped_gemm_fixed_nk_fp8 PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_grouped_gemm_fixed_nk_fp8 PRIVATE composable_kernel::device_gemm_operations)

  add_executable(client_grouped_gemm_fixed_nk_i8 grouped_gemm_fixed_nk_i8.cpp)
- target_link_libraries(client_grouped_gemm_fixed_nk_i8 PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_grouped_gemm_fixed_nk_i8 PRIVATE composable_kernel::device_gemm_operations)
client_example/22_im2col_col2im/CMakeLists.txt

  add_executable(client_image_to_column image_to_column.cpp)
- target_link_libraries(client_image_to_column PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_image_to_column PRIVATE composable_kernel::device_other_operations)

  add_executable(client_column_to_image column_to_image.cpp)
- target_link_libraries(client_column_to_image PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_column_to_image PRIVATE composable_kernel::device_other_operations)
client_example/23_elementwise_transpose/CMakeLists.txt

  add_executable(client_elementwise_transpose3d elementwise_transpose_3d.cpp)
- target_link_libraries(client_elementwise_transpose3d PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_elementwise_transpose3d PRIVATE composable_kernel::device_other_operations)
client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/CMakeLists.txt

  add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp)
- target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 PRIVATE composable_kernel::device_conv_operations)

  add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp16 grouped_conv_fwd_scaleadd_scaleadd_relu_fp16.cpp)
- target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp16 PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp16 PRIVATE composable_kernel::device_conv_operations)

  add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_bf16 grouped_conv_fwd_scaleadd_scaleadd_relu_bf16.cpp)
- target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_bf16 PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_bf16 PRIVATE composable_kernel::device_conv_operations)

  add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_int8 grouped_conv_fwd_scaleadd_scaleadd_relu_int8.cpp)
- target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_int8 PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_int8 PRIVATE composable_kernel::device_conv_operations)
client_example/24_grouped_convnd_fwd_scaleadd_ab/CMakeLists.txt

  add_executable(client_grouped_convnd_fwd_scaleadd_ab_fp32 grouped_conv_fwd_scaleadd_ab_fp32.cpp)
- target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_fp32 PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_fp32 PRIVATE composable_kernel::device_conv_operations)

  add_executable(client_grouped_convnd_fwd_scaleadd_ab_fp16 grouped_conv_fwd_scaleadd_ab_fp16.cpp)
- target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_fp16 PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_fp16 PRIVATE composable_kernel::device_conv_operations)

  add_executable(client_grouped_convnd_fwd_scaleadd_ab_bf16 grouped_conv_fwd_scaleadd_ab_bf16.cpp)
- target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_bf16 PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_bf16 PRIVATE composable_kernel::device_conv_operations)

  add_executable(client_grouped_convnd_fwd_scaleadd_ab_int8 grouped_conv_fwd_scaleadd_ab_int8.cpp)
- target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_int8 PRIVATE composable_kernel::device_operations)
+ target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_int8 PRIVATE composable_kernel::device_conv_operations)
client_example/CMakeLists.txt

@@ -48,7 +48,7 @@ else()
      endif()
  endif()

- find_package(composable_kernel COMPONENTS device_operations)
+ find_package(composable_kernel COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_contraction_operations device_reduction_operations)
  find_package(hip REQUIRED PATHS /opt/rocm)
  message(STATUS "Build with HIP ${hip_VERSION}")
  ...
example/27_layernorm2d_fwd/run_layernorm_example.inc (+2, -2)

example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc (+2, -2)

example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc (+2, -2)
include/ck/ck.hpp

@@ -134,6 +134,9 @@
  // inner product using V_DOT with DPP8 modifiers
  #define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1

+ // set stochastic rounding as default for f8 conversions
+ #define CK_USE_SR_F8_CONVERSION 1
+
  // block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
  #define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
  ...
include/ck/stream_config.hpp

@@ -11,6 +11,6 @@ struct StreamConfig
      hipStream_t stream_id_ = nullptr;
      bool time_kernel_      = false;
      int log_level_         = 0;
-     int cold_niters_       = 50;
-     int nrepeat_           = 200;
+     int cold_niters_       = 1;
+     int nrepeat_           = 10;
  };
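StreamConfig is brace-initialized at call sites (as in the profiler change further down), so the field order shown above (stream, time_kernel, log_level, cold_niters, nrepeat) is what callers rely on. Below is a minimal host-side sketch of the two ways to request timing after this change; the helper function names are illustrative only and are not part of the commit:

    // Minimal sketch, assuming a CK/ROCm build environment where this header resolves.
    #include "ck/stream_config.hpp"

    // Uses the new, cheaper defaults: 1 cold iteration, 10 timed repeats.
    inline StreamConfig make_quick_timing_config()
    {
        return StreamConfig{nullptr, /*time_kernel=*/true};
    }

    // Requests a longer measurement explicitly, matching the values the profiler
    // now passes to invoker_ptr->Run() in profile_gemm_impl.hpp.
    inline StreamConfig make_thorough_timing_config()
    {
        return StreamConfig{nullptr, /*time_kernel=*/true, /*log_level=*/0,
                            /*cold_niters=*/10, /*nrepeat=*/50};
    }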
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp

@@ -281,6 +281,24 @@ struct ConvertF8SR
      }
  };

+ struct ConvertF8RNE
+ {
+     // convert to fp8 using rounding to nearest even
+     template <typename Y, typename X>
+     __host__ __device__ void operator()(Y& y, const X& x) const
+     {
+         // check Y datatype
+         static_assert(is_same<Y, f8_t>::value || is_same<Y, bf8_t>::value,
+                       "Data type is not supported by this operation!");
+
+         // check X datatype
+         static_assert(is_same<X, float>::value || is_same<X, half_t>::value,
+                       "Data type is not supported by this operation!");
+
+         y = f8_convert_rne<Y>(x);
+     }
+ };
+
  struct Scale
  {
      __host__ __device__ Scale(float scale) : scale_(scale) {}
  ...
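The new ConvertF8RNE element-wise operation forwards to f8_convert_rne and is the round-to-nearest-even counterpart of the existing stochastic-rounding ConvertF8SR. A small host-side sketch of applying such a unary op to a single value follows; the full namespace qualification is assumed from CK's usual layout and is not shown in this diff:

    // Minimal sketch, assuming a CK build environment.
    #include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"

    void convert_one_value()
    {
        float    x = 1.25f;
        ck::f8_t y{};

        // Deterministic rounding, independent of CK_USE_SR_F8_CONVERSION.
        ck::tensor_operation::element_wise::ConvertF8RNE{}(y, x);

        // Stochastic-rounding counterpart that already existed.
        ck::tensor_operation::element_wise::ConvertF8SR{}(y, x);
    }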
include/ck/utility/type_convert.hpp

@@ -95,9 +95,113 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_t x)
      return type_convert<bhalf_t>(x_fp32);
  }

- // convert fp32 to fp8
+ // Declare a template function for fp8 conversion using SR
+ template <typename Y, typename X>
+ __host__ __device__ constexpr Y f8_convert_sr(X x);
+
+ // convert fp32 to fp8 with stochastic rounding
  template <>
- inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
+ inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
+ {
+     constexpr int seed = 42;
+     uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+ #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+     float max_fp8 = 240.0f;
+     x             = x > max_fp8 ? max_fp8 : (x < -max_fp8 ? -max_fp8 : x);
+     union
+     {
+         float fval;
+         uint32_t i32val;
+         uint8_t i8val[4]; // not endian independent
+     } val;
+     val.fval      = x;
+     uint32_t ival = 0;
+     ival          = __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0); // 0 pos
+     val.i32val    = ival;
+     return val.i8val[0]; // little endian
+ #else
+     constexpr bool negative_zero_nan = true;
+     constexpr bool clip              = true;
+     constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
+     return utils::cast_to_f8<float, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x, rng);
+ #endif
+ }
+
+ // convert fp16 to fp8 with stochastic rounding
+ template <>
+ inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
+ {
+ #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+     // convert to float and use native converion
+     return f8_convert_sr<f8_t>(type_convert<float>(x));
+ #else
+     constexpr bool negative_zero_nan = true;
+     constexpr bool clip              = true;
+     constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
+     constexpr int seed               = 42;
+     uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
+     return utils::cast_to_f8<half_t, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x, rng);
+ #endif
+ }
+
+ // convert fp32 to bf8 with stochastic rounding
+ template <>
+ inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
+ {
+     constexpr int seed = 42;
+     uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+ #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+     union
+     {
+         float fval;
+         uint32_t i32val;
+         uint8_t i8val[4]; // not endian independent
+     } val;
+     val.fval      = x;
+     uint32_t ival = 0;
+     ival          = __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
+     val.i32val    = ival;
+     return val.i8val[0]; // little endian
+ #else
+     constexpr bool negative_zero_nan = true;
+     constexpr bool clip              = true;
+     constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
+     return utils::cast_to_f8<float, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x, rng);
+ #endif
+ }
+
+ // convert fp16 to bf8 with stochastic rounding
+ template <>
+ inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, half_t>(half_t x)
+ {
+ #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+     // convert to float and use native converion
+     return f8_convert_sr<f8_t>(type_convert<float>(x));
+ #else
+     constexpr bool negative_zero_nan = true;
+     constexpr bool clip              = true;
+     constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
+     constexpr int seed               = 42;
+     uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
+     return utils::cast_to_f8<half_t, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x, rng);
+ #endif
+ }
+
+ // Declare a template function for fp8 conversion using RNE
+ template <typename Y, typename X>
+ __host__ __device__ constexpr Y f8_convert_rne(X x);
+
+ // convert fp32 to fp8 with rounding to nearest even
+ template <>
+ inline __host__ __device__ f8_t f8_convert_rne<f8_t, float>(float x)
  {
  #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
      float max_fp8 = 240.0f;
  ...

@@ -124,6 +228,80 @@ inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
  #endif
  }

+ // convert fp16 to fp8 with rounding to nearest even
+ template <>
+ inline __host__ __device__ f8_t f8_convert_rne<f8_t, half_t>(half_t x)
+ {
+ #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+     // convert to float and use native converion
+     return f8_convert_rne<f8_t>(type_convert<float>(x));
+ #else
+     constexpr bool negative_zero_nan = true;
+     constexpr bool clip              = true;
+     constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
+     constexpr uint32_t rng           = 0;
+     return utils::cast_to_f8<half_t, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x, rng);
+ #endif
+ }
+
+ // convert fp32 to bf8 with rounding to nearest even
+ template <>
+ inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, float>(float x)
+ {
+ #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+     union
+     {
+         float fval;
+         uint32_t i32val;
+         uint8_t i8val[4]; // not endian independent
+     } val;
+     val.fval      = x;
+     uint32_t ival = 0;
+     ival          = __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, val.fval, ival, false); // false -> WORD0
+     val.i32val    = ival;
+     return val.i8val[0];
+ #else
+     constexpr bool negative_zero_nan = true;
+     constexpr bool clip              = true;
+     constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
+     constexpr uint32_t rng           = 0;
+     return utils::cast_to_f8<float, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x, rng);
+ #endif
+ }
+
+ // convert fp16 to bf8 with rounding to nearest even
+ template <>
+ inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, half_t>(half_t x)
+ {
+ #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+     // convert to float and use native converion
+     return f8_convert_rne<bf8_t>(type_convert<float>(x));
+ #else
+     constexpr bool negative_zero_nan = true;
+     constexpr bool clip              = true;
+     constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
+     constexpr uint32_t rng           = 0;
+     return utils::cast_to_f8<half_t, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x, rng);
+ #endif
+ }
+
+ // convert fp32 to fp8
+ template <>
+ inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
+ {
+ #if defined CK_USE_SR_F8_CONVERSION
+     return f8_convert_sr<f8_t>(x);
+ #else
+     return f8_convert_rne<f8_t>(x);
+ #endif
+ }
+
  // convert fp8 to fp32
  template <>
  inline __host__ __device__ float type_convert<float, f8_t>(f8_t x)
  ...

@@ -174,17 +352,10 @@ inline __host__ __device__ half2_t type_convert<half2_t, float2_t>(float2_t x)
  template <>
  inline __host__ __device__ f8_t type_convert<f8_t, half_t>(half_t x)
  {
- #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
-     // convert to float and use native converion
-     return type_convert<f8_t>(type_convert<float>(x));
+ #if defined CK_USE_SR_F8_CONVERSION
+     return f8_convert_sr<f8_t>(x);
  #else
-     constexpr bool negative_zero_nan = true;
-     constexpr bool clip              = true;
-     constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
-     constexpr uint32_t rng           = 0;
-     return utils::cast_to_f8<half_t, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x, rng);
+     return f8_convert_nre<f8_t>(x);
  #endif
  }

@@ -205,26 +376,10 @@ inline __host__ __device__ half_t type_convert<half_t, f8_t>(f8_t x)
  template <>
  inline __host__ __device__ bf8_t type_convert<bf8_t, float>(float x)
  {
- #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
-     union
-     {
-         float fval;
-         uint32_t i32val;
-         uint8_t i8val[4]; // not endian independent
-     } val;
-     val.fval      = x;
-     uint32_t ival = 0;
-     ival          = __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, val.fval, ival, false); // false -> WORD0
-     val.i32val    = ival;
-     return val.i8val[0];
+ #if defined CK_USE_SR_F8_CONVERSION
+     return f8_convert_sr<bf8_t>(x);
  #else
-     constexpr bool negative_zero_nan = true;
-     constexpr bool clip              = true;
-     constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
-     constexpr uint32_t rng           = 0;
-     return utils::cast_to_f8<float, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x, rng);
+     return f8_convert_rne<bf8_t>(x);
  #endif
  }

@@ -248,17 +403,10 @@ inline __host__ __device__ float type_convert<float, bf8_t>(bf8_t x)
  template <>
  inline __host__ __device__ bf8_t type_convert<bf8_t, half_t>(half_t x)
  {
- #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
-     // convert to float and use native converion
-     return type_convert<bf8_t>(type_convert<float>(x));
+ #if defined CK_USE_SR_F8_CONVERSION
+     return f8_convert_sr<bf8_t>(x);
  #else
-     constexpr bool negative_zero_nan = true;
-     constexpr bool clip              = true;
-     constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
-     constexpr uint32_t rng           = 0;
-     return utils::cast_to_f8<half_t, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x, rng);
+     return f8_convert_rne<bf8_t>(x);
  #endif
  }

@@ -331,104 +479,4 @@ inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, half_t>(half_t x)
      return bf16_convert_rtn<bhalf_t>(x_fp32);
  }

- // Declare a template function for fp8 conversion using SR
- template <typename Y, typename X>
- __host__ __device__ constexpr Y f8_convert_sr(X x);
-
- // convert fp32 to fp8 with stochastic rounding
- template <>
- inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
- {
-     ...
- }
-
- // convert fp16 to fp8 with stochastic rounding
- template <>
- inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
- {
-     ...
- }
-
- // convert fp32 to bf8 with stochastic rounding
- template <>
- inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
- {
-     ...
- }
-
- // convert fp16 to bf8 with stochastic rounding
- template <>
- inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, half_t>(half_t x)
- {
-     ...
-     // as thread id is not available on host, use 0 for prn generation
-     ...
- }
  } // namespace ck
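With CK_USE_SR_F8_CONVERSION defined in ck.hpp, the plain type_convert specializations above now route f8/bf8 casts through the stochastic-rounding path, while f8_convert_rne stays available when deterministic rounding is wanted. A minimal host-side sketch of the two entry points follows; it is an illustrative snippet, not part of the commit:

    // Minimal sketch, assuming a CK build environment where this header resolves.
    #include "ck/utility/type_convert.hpp"

    void fp8_roundtrip_example()
    {
        float x = 1.7f;

        // Default conversion: stochastic rounding when CK_USE_SR_F8_CONVERSION is
        // defined in ck.hpp, round-to-nearest-even otherwise.
        ck::f8_t y_default = ck::type_convert<ck::f8_t>(x);

        // Explicit rounding modes introduced/moved by this commit.
        ck::f8_t y_sr  = ck::f8_convert_sr<ck::f8_t>(x);
        ck::f8_t y_rne = ck::f8_convert_rne<ck::f8_t>(x);

        // Convert back up to float for inspection.
        float back = ck::type_convert<float>(y_default);
        (void)y_sr; (void)y_rne; (void)back;
    }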
library/src/tensor_operation_instance/gpu/CMakeLists.txt

@@ -58,7 +58,12 @@ endfunction(add_instance_library INSTANCE_NAME)
  file(GLOB dir_list LIST_DIRECTORIES true *)
- set(CK_DEVICE_INSTANCES)
+ set(CK_DEVICE_OTHER_INSTANCES)
+ set(CK_DEVICE_GEMM_INSTANCES)
+ set(CK_DEVICE_CONV_INSTANCES)
+ set(CK_DEVICE_MHA_INSTANCES)
+ set(CK_DEVICE_CONTRACTION_INSTANCES)
+ set(CK_DEVICE_REDUCTION_INSTANCES)
  FOREACH(subdir_path ${dir_list})
      set(target_dir)
      IF(IS_DIRECTORY "${subdir_path}")
  ...

@@ -122,7 +127,19 @@ FOREACH(subdir_path ${dir_list})
          if((add_inst EQUAL 1))
              get_filename_component(target_dir ${subdir_path} NAME)
              add_subdirectory(${target_dir})
-             list(APPEND CK_DEVICE_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+             if("${cmake_instance}" MATCHES "gemm")
+                 list(APPEND CK_DEVICE_GEMM_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+             elseif("${cmake_instance}" MATCHES "conv")
+                 list(APPEND CK_DEVICE_CONV_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+             elseif("${cmake_instance}" MATCHES "mha")
+                 list(APPEND CK_DEVICE_MHA_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+             elseif("${cmake_instance}" MATCHES "contr")
+                 list(APPEND CK_DEVICE_CONTRACTION_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+             elseif("${cmake_instance}" MATCHES "reduce")
+                 list(APPEND CK_DEVICE_REDUCTION_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+             else()
+                 list(APPEND CK_DEVICE_OTHER_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+             endif()
              message("add_instance_directory ${subdir_path}")
          else()
              message("skip_instance_directory ${subdir_path}")
  ...

@@ -130,18 +147,14 @@ FOREACH(subdir_path ${dir_list})
      ENDIF()
  ENDFOREACH()

- add_library(device_operations STATIC ${CK_DEVICE_INSTANCES})
- add_library(composablekernels::device_operations ALIAS device_operations)
- set(DEV_OPS_INC_DIRS
-     ${PROJECT_SOURCE_DIR}/include/ck/
-     ${PROJECT_SOURCE_DIR}/library/include/ck/
- )
- target_compile_features(device_operations PUBLIC)
- set_target_properties(device_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
- target_include_directories(device_operations PUBLIC
+ if(CK_DEVICE_OTHER_INSTANCES)
+     add_library(device_other_operations STATIC ${CK_DEVICE_OTHER_INSTANCES})
+     add_library(composablekernels::device_other_operations ALIAS device_other_operations)
+     target_compile_features(device_other_operations PUBLIC)
+     set_target_properties(device_other_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+     target_include_directories(device_other_operations PUBLIC
      $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>
      $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>
      $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_description>
  ...

@@ -157,23 +170,115 @@ target_include_directories(device_operations PUBLIC
      $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/utility>
      $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance>
      $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
-     $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/quantization>
+     $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/softmax>
      )
+     rocm_install(TARGETS device_other_operations
+         EXPORT device_other_operationsTargets)
+     rocm_install(EXPORT device_other_operationsTargets
+         FILE composable_kerneldevice_other_operationsTargets.cmake
+         NAMESPACE composable_kernel::
+         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+     )
+ endif()
+
+ if(CK_DEVICE_GEMM_INSTANCES)
+     add_library(device_gemm_operations STATIC ${CK_DEVICE_GEMM_INSTANCES})
+     add_library(composablekernels::device_gemm_operations ALIAS device_gemm_operations)
+     target_compile_features(device_gemm_operations PUBLIC)
+     set_target_properties(device_gemm_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+     target_include_directories(device_gemm_operations PUBLIC
+         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
+     )
+     rocm_install(TARGETS device_gemm_operations
+         EXPORT device_gemm_operationsTargets)
+     rocm_install(EXPORT device_gemm_operationsTargets
+         FILE composable_kerneldevice_gemm_operationsTargets.cmake
+         NAMESPACE composable_kernel::
+         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+     )
+ endif()
+
+ if(CK_DEVICE_CONV_INSTANCES)
+     add_library(device_conv_operations STATIC ${CK_DEVICE_CONV_INSTANCES})
+     add_library(composablekernels::device_conv_operations ALIAS device_conv_operations)
+     target_compile_features(device_conv_operations PUBLIC)
+     set_target_properties(device_conv_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+     target_include_directories(device_conv_operations PUBLIC
+         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
+         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange>
+         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data>
+         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight>
+         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd>
+     )
+     rocm_install(TARGETS device_conv_operations
+         EXPORT device_conv_operationsTargets)
+     rocm_install(EXPORT device_conv_operationsTargets
+         FILE composable_kerneldevice_conv_operationsTargets.cmake
+         NAMESPACE composable_kernel::
+         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+     )
+ endif()
+
+ if(CK_DEVICE_MHA_INSTANCES)
+     add_library(device_mha_operations STATIC ${CK_DEVICE_MHA_INSTANCES})
+     add_library(composablekernels::device_mha_operations ALIAS device_mha_operations)
+     target_compile_features(device_mha_operations PUBLIC)
+     set_target_properties(device_mha_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+     target_include_directories(device_mha_operations PUBLIC
+         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/mha>
+     )
+     rocm_install(TARGETS device_mha_operations
+         EXPORT device_mha_operationsTargets)
+     rocm_install(EXPORT device_mha_operationsTargets
+         FILE composable_kerneldevice_mha_operationsTargets.cmake
+         NAMESPACE composable_kernel::
+         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+     )
+ endif()
+
+ if(CK_DEVICE_CONTRACTION_INSTANCES)
+     add_library(device_contraction_operations STATIC ${CK_DEVICE_CONTRACTION_INSTANCES})
+     add_library(composablekernels::device_contraction_operations ALIAS device_contraction_operations)
+     target_compile_features(device_contraction_operations PUBLIC)
+     set_target_properties(device_contraction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+     target_include_directories(device_contraction_operations PUBLIC
+         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
+         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/contraction>
+     )
+     rocm_install(TARGETS device_contraction_operations
+         EXPORT device_contraction_operationsTargets)
+     rocm_install(EXPORT device_contraction_operationsTargets
+         FILE composable_kerneldevice_contraction_operationsTargets.cmake
+         NAMESPACE composable_kernel::
+         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+     )
+ endif()
+
+ if(CK_DEVICE_REDUCTION_INSTANCES)
+     add_library(device_reduction_operations STATIC ${CK_DEVICE_REDUCTION_INSTANCES})
+     add_library(composablekernels::device_reduction_operations ALIAS device_reduction_operations)
+     target_compile_features(device_reduction_operations PUBLIC)
+     set_target_properties(device_reduction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+     target_include_directories(device_reduction_operations PUBLIC
+         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>
+     )
+     rocm_install(TARGETS device_reduction_operations
+         EXPORT device_reduction_operationsTargets)
+     rocm_install(EXPORT device_reduction_operationsTargets
+         FILE composable_kerneldevice_reduction_operationsTargets.cmake
+         NAMESPACE composable_kernel::
+         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+     )
+ endif()

- #once new arches are enabled make this an option on the main cmake file
- # and pass down here to be exported
- target_compile_options(device_operations PRIVATE
-     --offload-arch=gfx908
-     --offload-arch=gfx90a
- )
- # install(TARGETS device_operations LIBRARY DESTINATION lib)
- rocm_install(TARGETS device_operations
-     EXPORT device_operationsTargets)
- rocm_install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
- rocm_install(EXPORT device_operationsTargets
-     FILE composable_kerneldevice_operationsTargets.cmake
-     NAMESPACE composable_kernel::
-     DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
- )
+ add_library(device_operations INTERFACE)
+ target_link_libraries(device_operations INTERFACE
+     device_contraction_operations
+     device_conv_operations
+     device_gemm_operations
+     device_other_operations
+     device_reduction_operations
+     utility)
+
+ set(DEV_OPS_INC_DIRS
+     ${PROJECT_SOURCE_DIR}/include/ck/
+     ${PROJECT_SOURCE_DIR}/library/include/ck/
+ )
+
+ rocm_install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
profiler/include/profiler/profile_gemm_impl.hpp

@@ -166,7 +166,7 @@ int profile_gemm_impl(int do_verification,
          std::string op_name = op_ptr->GetTypeString();

          float avg_time =
-             invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+             invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, 10, 50});

          std::size_t flop = std::size_t(2) * M * N * K;
  ...
test/transpose/test_transpose_ut_cases.inc

@@ -14,7 +14,6 @@ TYPED_TEST(TestTranspose, Test1)
      this->Run();
  }
-
  TYPED_TEST(TestTranpose, Test2)
  {
      std::vector<int> Ms{127, 255, 312, 799, 1573};
  ...

@@ -27,4 +26,3 @@ TYPED_TEST(TestTranpose, Test2)
      this->Run();
  }
-