gaoqiong / composable_kernel_ROCM · Commits

Commit 8b83b087 · authored Jan 02, 2025 by mtgu0705 · parent f9435a75

format some files

Showing 9 changed files with 40 additions and 79 deletions (+40 -79)
example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp  +0 -7
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp  +5 -5
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp  +20 -52
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp  +6 -6
include/ck/utility/amd_inline_asm.hpp  +1 -1
library/include/ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp  +3 -3
library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp  +1 -1
profiler/include/profiler/profile_gemm_b_scale_impl.hpp  +2 -2
profiler/src/profile_gemm_b_scale.cpp  +2 -2
example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp

@@ -45,13 +45,6 @@ using DeviceGemmV2Instance =
 // clang-format on
-using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
-                                                                        BDataType,
-                                                                        CDataType,
-                                                                        AccDataType,
-                                                                        PassThrough,
-                                                                        PassThrough,
-                                                                        PassThrough>;
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
                                                                         CDataType,
...
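The only change here is dropping one of two back-to-back ReferenceGemmInstance aliases. As a quick aside (illustrative snippet, not taken from the repository): C++ only allows a type alias to be redeclared when it names the same type, so the duplicate declaration was redundant rather than harmful, while any conflicting redefinition would not have compiled.

using Ref = int;     // first declaration
using Ref = int;     // OK: redeclares the same type, merely redundant
// using Ref = long; // error: conflicting declaration of alias 'Ref'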
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp

@@ -6,16 +6,16 @@
 #include <iostream>
 #include <sstream>

+#include "ck/utility/common_header.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/flush_cache.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp"
-#include "ck/host_utility/device_prop.hpp"
-#include "ck/utility/common_header.hpp"
-#include "ck/host_utility/kernel_launch.hpp"
-#include "ck/host_utility/flush_cache.hpp"

 namespace ck {
 namespace tensor_operation {
...
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp

@@ -41,74 +41,42 @@ __host__ __device__ inline half4_t pki4_to_half4(int q)

The hunk's unchanged context is the tail of pki4_to_half4 (the packed FMA on the high nibbles, the return and the closing brace) and, at the bottom, the signature of pki4_to_half2(pk_i4_t q). In between, the hunk drops the disabled experiment that folded the dequantization scale into the FMA constants:

// Further fuse the scale into inline assembly, sanity failed
#if 0
__host__ __device__ inline half4_t pki4_to_half4_scale(int q, const ck::half_t& scale)
{
    constexpr int LO = 0x000f000f;
    constexpr int HI = 0x00f000f0;
    constexpr int EX = 0x64006400;

    // Guarantee that the `(a & b) | c` operations are LOP3s.
    // int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
    int lo = amd_assembly_and_or_b32(q, LO, EX);
    // int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
    int hi = amd_assembly_and_or_b32(q, HI, EX);

    // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
    // directly into `SUB` and `ADD`.
    // constexpr int SUB = 0xE408E408; //-8
    // constexpr int MUL = 0x2c002c00; // 1/16
    // constexpr int ADD = 0xd480d480; //-79
    constexpr half_t SUB = bit_cast<half_t>(static_cast<uint16_t>(0xE408));
    constexpr half_t MUL = bit_cast<half_t>(static_cast<uint16_t>(0x2c00));
    constexpr half_t ADD = bit_cast<half_t>(static_cast<uint16_t>(0xd480));

    vector_type<half_t, 2> scale_2;
    scale_2.template AsType<half_t>()(Number<0>{}) = scale;
    scale_2.template AsType<half_t>()(Number<1>{}) = scale;
    vector_type<half_t, 2> sub_2;
    sub_2.template AsType<half_t>()(Number<0>{}) = SUB * scale;
    sub_2.template AsType<half_t>()(Number<1>{}) = SUB * scale;
    vector_type<half_t, 2> mul_2;
    mul_2.template AsType<half_t>()(Number<0>{}) = MUL * scale;
    mul_2.template AsType<half_t>()(Number<1>{}) = MUL * scale;
    vector_type<half_t, 2> add_2;
    add_2.template AsType<half_t>()(Number<0>{}) = ADD * scale;
    add_2.template AsType<half_t>()(Number<1>{}) = ADD * scale;

    vector_type<half_t, 4> res;

    res.template AsType<half2_t>()(Number<0>{}) =
        amd_assembly_pk_fma_f16(bit_cast<half2_t>(lo),
                                scale_2.template AsType<half2_t>()(Number<0>{}),
                                sub_2.template AsType<half2_t>()(Number<0>{}));

    res.template AsType<half2_t>()(Number<1>{}) =
        amd_assembly_pk_fma_f16(bit_cast<half2_t>(hi),
                                mul_2.template AsType<half2_t>()(Number<0>{}),
                                add_2.template AsType<half2_t>()(Number<0>{}));

    // asm volatile("v_pk_mul_f16 %0, %1, %2"
    //              : "=v"(res.template AsType<half2_t>()(Number<0>{}))
    //              : "v"(res.template AsType<half2_t>()(Number<0>{})), "v"(scale));

    // asm volatile("v_pk_mul_f16 %0, %1, %2"
    //              : "=v"(res.template AsType<half2_t>()(Number<1>{}))
    //              : "v"(res.template AsType<half2_t>()(Number<1>{})), "v"(scale));

    return res.template AsType<half4_t>()[Number<0>{}];
}
#endif

What stays in the file is the pki4_to_half4_scale variant that dequantizes with the magic constants and then applies the scale with v_pk_mul_f16 inline assembly:

__host__ __device__ inline half4_t pki4_to_half4_scale(int q, const ck::half2_t& scale)
{
    const int LO = 0x000f000f;
    const int HI = 0x00f000f0;
    const int EX = 0x64006400;

    // Extract the two int4 at low bit and create two fp16 number.
    int lo = amd_assembly_and_or_b32(q, LO, EX);
    // Extract the two int4 at hight bit and create two fp16 number.
    int hi = amd_assembly_and_or_b32(q, HI, EX);

    const int SUB = 0xE408E408; // half2 {-1032, -1032}
    const int MUL = 0x2c002c00; // half2 {1 / 16, 1 / 16}
    const int ADD = 0xd480d480; // half2 {-72, -72}

    vector_type<half_t, 4> res;

    res.template AsType<half2_t>()(Number<0>{}) =
        amd_assembly_pk_add_f16(bit_cast<half2_t>(lo), bit_cast<half2_t>(SUB));

    res.template AsType<half2_t>()(Number<1>{}) = amd_assembly_pk_fma_f16(
        bit_cast<half2_t>(hi), bit_cast<half2_t>(MUL), bit_cast<half2_t>(ADD));

    asm volatile("v_pk_mul_f16 %0, %1, %2"
                 : "=v"(res.template AsType<half2_t>()(Number<0>{}))
                 : "v"(res.template AsType<half2_t>()(Number<0>{})), "v"(scale));

    asm volatile("v_pk_mul_f16 %0, %1, %2"
                 : "=v"(res.template AsType<half2_t>()(Number<1>{}))
                 : "v"(res.template AsType<half2_t>()(Number<1>{})), "v"(scale));

    return res.template AsType<half4_t>()[Number<0>{}];
}
...
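A plain-C++ cross-check of the magic constants may help readers follow the dequantization trick above. The sketch below is illustrative only (it is not part of the commit) and assumes the zero-point-of-8 int4 encoding described in the comments: ORing a low nibble into 0x6400 produces the fp16 value 1024 + n, ORing a high nibble produces 1024 + 16*n, and the follow-up add/FMA with -1032, 1/16 and -72 recovers n - 8 in both cases. The removed #if 0 variant relied on the same identity, only with the multiplier and offset pre-multiplied by the per-block scale so that the scaling folds into the packed FMA.

#include <cstdint>
#include <cstdio>

// Reference decode: nibble n in [0, 15] with zero point 8 maps to n - 8.
static float ref_int4(uint32_t q, int nibble_index)
{
    const uint32_t n = (q >> (4 * nibble_index)) & 0xFu;
    return static_cast<float>(n) - 8.0f;
}

int main()
{
    const uint32_t q = 0x00730092u; // arbitrary packed test word

    // pki4_to_half4 touches the nibbles at bits [0:3], [4:7], [16:19] and [20:23].
    const int idxs[] = {0, 1, 4, 5};
    for(int idx : idxs)
    {
        const float n = static_cast<float>((q >> (4 * idx)) & 0xFu);

        // Even nibbles take the "low" path:  ((q & 0x000f000f) | 0x64006400) is 1024 + n
        //                                    per fp16 lane; adding -1032 (0xE408) gives n - 8.
        // Odd nibbles take the "high" path:  ((q & 0x00f000f0) | 0x64006400) is 1024 + 16*n;
        //                                    fma with 1/16 (0x2c00) and -72 (0xd480) gives n - 8.
        const float via_magic = (idx % 2 == 0) ? (1024.0f + n) - 1032.0f
                                               : (1024.0f + 16.0f * n) * (1.0f / 16.0f) - 72.0f;

        std::printf("nibble %d: reference %+.1f, magic-constant path %+.1f\n",
                    idx, ref_int4(q, idx), via_magic);
    }
    return 0;
}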
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp

@@ -3,16 +3,16 @@
 #pragma once

+#include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp"
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
-#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
-#include "ck/utility/common_header.hpp"

 namespace ck {

The remaining hunks only adjust line wrapping and indentation; the code they touch reads:

@@ -29,7 +29,7 @@ template <typename GridwiseGemm,
          TailNumber TailNum = TailNumber::Full>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
#endif
    // __attribute__((amdgpu_waves_per_eu(1, 1)))
    kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)

@@ -59,7 +59,7 @@ template <typename GridwiseGemm,
          TailNumber TailNum = TailNumber::Full>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
#endif
    // __attribute__((amdgpu_waves_per_eu(1, 1)))
    kernel_gemm_xdl_cshuffle_v3_2lds(typename GridwiseGemm::Argument karg)

@@ -1926,7 +1926,7 @@ struct GridwiseGemm_xdl_cshuffle_v3
            1, 1, false>(b_scale_grid_desc_bn_ak,
                         make_multi_index(block_n_id * NPerBlock / ScaleBlockN + b_thread_offset_n,
                                          b_thread_offset_k / ScaleBlockK));

        constexpr auto b_scale_thread_slice_copy_step =
...
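Since the surviving context above is dominated by the launch-bounds guard around the kernel symbols, here is a minimal, self-contained sketch of that pattern. It assumes a HIP toolchain; CK_USE_LAUNCH_BOUNDS and CK_MAX_THREAD_PER_BLOCK mirror the macros used in the hunk, while DummyGridwiseGemm and kernel_stub are stand-ins invented for the illustration.

#include <hip/hip_runtime.h>

#ifndef CK_USE_LAUNCH_BOUNDS
#define CK_USE_LAUNCH_BOUNDS 1
#endif
#ifndef CK_MAX_THREAD_PER_BLOCK
#define CK_MAX_THREAD_PER_BLOCK 256
#endif

// Stand-in for a gridwise GEMM: only the nested Argument type matters here.
struct DummyGridwiseGemm
{
    struct Argument
    {
        float* p_c;
        int n;
    };
};

template <typename GridwiseGemm, int MinimumOccupancy = 1>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
#endif
        kernel_stub(typename GridwiseGemm::Argument karg)
{
    // A real kernel forwards karg on to the gridwise GEMM; this stub only shows how the
    // occupancy hint is conditionally attached between the return type and the name.
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < karg.n)
    {
        karg.p_c[i] = 0.0f;
    }
}

int main()
{
    // Taking the kernel's address forces instantiation and checks that the attribute
    // placement compiles; no device launch is needed for the illustration.
    auto kernel_ptr = kernel_stub<DummyGridwiseGemm, 1>;
    (void)kernel_ptr;
    return 0;
}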
include/ck/utility/amd_inline_asm.hpp

@@ -4,8 +4,8 @@
 #ifndef CK_AMD_INLINE_ASM_HPP
 #define CK_AMD_INLINE_ASM_HPP

+#include "data_type.hpp"
 #include "c_style_pointer_cast.hpp"
-#include "data_type.hpp"

 // TODO: deprecate all amd_assembly_outer_product_xxx
...
library/include/ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp

@@ -3,12 +3,12 @@
 #pragma once

+#include <vector>
+#include <memory>
 #include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include <memory>
-#include <vector>

 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
...
library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp

@@ -2,9 +2,9 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
...
profiler/include/profiler/profile_gemm_b_scale_impl.hpp

@@ -8,18 +8,18 @@
 #include <typeinfo>

 #include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp"

+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 namespace ck {
 namespace profiler {
...
profiler/src/profile_gemm_b_scale.cpp

 // SPDX-License-Identifier: MIT
 // Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

+#include <cstdlib>
+#include <initializer_list>
 #include <iostream>
 #include <numeric>
-#include <initializer_list>
-#include <cstdlib>

 #include "profiler/profile_gemm_b_scale_impl.hpp"
 #include "profiler_operation_registry.hpp"
...