OpenDAS / hytlass · Commit d22dbec2
Authored Dec 09, 2025 by zhoux

Initial commit: release hytlass-0.1.0

Changes: 124 files. Showing 20 changed files with 9067 additions and 0 deletions (+9067 −0).
examples/11_hytlass_tensorop_group_conv/gfx928_tensorop_group_conv2dfprop.cu (+681 −0)
examples/12_depthwise_simt_conv2dfprop/CMakeLists.txt (+36 −0)
examples/12_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu (+669 −0)
examples/13_hytlass_tensorop_fused_conv2d_fprop/CMakeLists.txt (+37 −0)
examples/13_hytlass_tensorop_fused_conv2d_fprop/gfx928_tensorop_conv2d_bias_add_relu.cu (+786 −0)
examples/13_hytlass_tensorop_fused_conv2d_fprop/gfx928_tensorop_conv2d_bias_relu.cu (+772 −0)
examples/14_gather_scatter_fusion/CMakeLists.txt (+38 −0)
examples/14_gather_scatter_fusion/gather_k_fusion.cu (+525 −0)
examples/14_gather_scatter_fusion/gather_scatter_fusion.cu (+530 −0)
examples/15_hute_group_gemm/CMakeLists.txt (+33 −0)
examples/15_hute_group_gemm/hute_gfx928_group_gemm.cu (+702 −0)
examples/16_hytlass_gemm_softmax/CMakeLists.txt (+33 −0)
examples/16_hytlass_gemm_softmax/gemm_softmax.cu (+690 −0)
examples/16_hytlass_gemm_softmax/gemm_with_epilogue_visitor.h (+536 −0)
examples/16_hytlass_gemm_softmax/gemm_with_softmax.h (+663 −0)
examples/17_ell_block_sparse_gemm/CMakeLists.txt (+38 −0)
examples/17_ell_block_sparse_gemm/ell_block_sparse_gemm.cu (+729 −0)
examples/17_ell_block_sparse_gemm/ell_block_sparse_gemm_bias_relu.cu (+749 −0)
examples/18_gemm_with_abs_max/CMakeLists.txt (+34 −0)
examples/18_gemm_with_abs_max/gemm_with_absmax.cu (+786 −0)
examples/11_hytlass_tensorop_group_conv/gfx928_tensorop_group_conv2dfprop.cu (new file, mode 100644)
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
This example shows how to run group convolution kernels using functions and data structures
provided by HYTLASS using tensor cores.
There are two group conv modes:
1. hytlass::conv::GroupMode::kSingleGroup
   This mode is for large K problem sizes: k_per_group (K/groups) is equal to or larger than
   threadblock_tile_N. One or multiple threadblocks calculate the data of one group.
2. hytlass::conv::GroupMode::kMultipleGroup
   This mode is for small K problem sizes: k_per_group (K/groups) is smaller than threadblock_tile_N.
   One threadblock will calculate data from more than one group.
The function profile_convolution_selecter() shows how to choose a kernel with the appropriate group
mode according to the problem size and the threadblock_tile size. A sketch of the selection rule
follows the includes below.
*/
#include <iostream>
#include <sstream>
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm.h"
#include "hytlass/conv/kernel/default_conv2d_group_fprop.h"
#include "hytlass/conv/device/implicit_gemm_convolution.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/tensor_view_io.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/reference/host/convolution.h"
#include "hytlass/util/reference/device/convolution.h"
#include "helper.h"
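// A minimal sketch of the group-mode selection rule described in the header comment
// (illustrative only, not a HYTLASS API; the function name is ours): with
// k_per_group = K / groups, kSingleGroup applies when k_per_group is at least the
// threadblock tile N; otherwise one threadblock must cover several groups and
// kMultipleGroup is required. profile_convolution_selecter() below applies the same
// rule using ThreadblockShape::kN.
constexpr bool use_single_group_mode(int k, int groups, int threadblock_tile_n) {
  return (k / groups) >= threadblock_tile_n;
}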
// The code section below describes datatype for input, output tensors and computation between
// elements
using ElementAccumulator = float;                  // Data type of accumulator
using ElementComputeEpilogue = float;              // Data type of epilogue computation (alpha, beta)
using ElementInputA = hytlass::half_t;             // Data type of elements in input tensor
using ElementInputB = hytlass::half_t;             // Data type of elements in input tensor
using ElementOutput = float;                       // Data type of elements in output tensor

using LayoutInputA = hytlass::layout::TensorNHWC;
using LayoutInputB = hytlass::layout::TensorNHWC;
using LayoutOutput = hytlass::layout::TensorNHWC;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = hytlass::arch::OpClassTensorOp;

// This code section describes hip Gfx architecture number
using SmArch = hytlass::arch::Gfx928;

// This code section describes the tile size a thread block will compute
using ThreadblockShape = hytlass::gemm::GemmShape<128, 128, 32>;  // Threadblock tile shape

// This code section describes tile size a warp will compute
using WarpShape = hytlass::gemm::GemmShape<64, 64, 32>;  // Warp tile shape

// This code section describes the size of MMA op
using InstructionShape = hytlass::gemm::GemmShape<16, 16, 16>;  // TensorCore instruction shape

// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = hytlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;

// Number of pipelines you want to use
constexpr int NumStages = 1;
// This code section describes the epilogue part of the kernel, we use default value
using EpilogueOp = hytlass::epilogue::thread::LinearCombination<
    ElementOutput,                                     // Data type of output matrix.
    128 / hytlass::sizeof_bits<ElementOutput>::value,  // The number of elements per vectorized
                                                       // memory access. This becomes the vector width
                                                       // of math instructions in the epilogue too.
    ElementAccumulator,                                // Data type of accumulator
    ElementComputeEpilogue>;                           // Data type for alpha/beta in linear combination
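// For reference, the linear-combination epilogue above computes, per output element,
// D = alpha * accumulator + beta * C. A scalar sketch of that math (illustrative only,
// not the HYTLASS implementation; the helper name is ours):
inline float linear_combination_sketch(float alpha, float accum, float beta, float c) {
  return alpha * accum + beta * c;
}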
// Analytic kernel and operation for single group problem size
using AnalyticSingleGroupKernel = typename hytlass::conv::kernel::DefaultConv2dGroupFprop<
    ElementInputA, LayoutInputA,
    ElementInputB, LayoutInputB,
    ElementOutput, LayoutOutput,
    ElementAccumulator,
    MMAOp,
    SmArch,
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    EpilogueOp,
    SwizzleThreadBlock,
    NumStages,
    hytlass::arch::OpMultiplyAdd,
    hytlass::conv::GroupMode::kSingleGroup,
    hytlass::conv::IteratorAlgorithm::kAnalytic>::Kernel;

using AnalyticSingleGroupOperation =
    hytlass::conv::device::ImplicitGemmConvolution<AnalyticSingleGroupKernel>;
// Analytic kernel and operation for multiple group problem size
using AnalyticMultipleGroupKernel = typename hytlass::conv::kernel::DefaultConv2dGroupFprop<
    ElementInputA, LayoutInputA,
    ElementInputB, LayoutInputB,
    ElementOutput, LayoutOutput,
    ElementAccumulator,
    MMAOp,
    SmArch,
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    EpilogueOp,
    SwizzleThreadBlock,
    NumStages,
    hytlass::arch::OpMultiplyAdd,
    hytlass::conv::GroupMode::kMultipleGroup,
    hytlass::conv::IteratorAlgorithm::kAnalytic>::Kernel;

using AnalyticMultipleGroupOperation =
    hytlass::conv::device::ImplicitGemmConvolution<AnalyticMultipleGroupKernel>;
// Optimized kernel and operation for single group problem size
using OptimizedSingleGroupKernel = typename hytlass::conv::kernel::DefaultConv2dGroupFprop<
    ElementInputA, LayoutInputA,
    ElementInputB, LayoutInputB,
    ElementOutput, LayoutOutput,
    ElementAccumulator,
    MMAOp,
    SmArch,
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    EpilogueOp,
    SwizzleThreadBlock,
    NumStages,
    hytlass::arch::OpMultiplyAdd,
    hytlass::conv::GroupMode::kSingleGroup,
    hytlass::conv::IteratorAlgorithm::kOptimized>::Kernel;

using OptimizedSingleGroupOperation =
    hytlass::conv::device::ImplicitGemmConvolution<OptimizedSingleGroupKernel>;
/////////////////////////////////////////////////////////////////////////////////////////////////

// Command line options parsing
struct Options {

  bool help;
  hytlass::Tensor4DCoord input_size;
  hytlass::Tensor4DCoord filter_size;
  hytlass::Tensor4DCoord padding;
  hytlass::MatrixCoord conv_stride;
  hytlass::MatrixCoord dilation;
  int groups;
  bool reference_check;
  bool measure_performance;
  int iterations;
  ElementComputeEpilogue alpha;
  ElementComputeEpilogue beta;
  bool optimized;
  std::string tag;

  Options():
    help(false),
    input_size(1, 32, 32, 32),
    filter_size(32, 3, 3, 32),
    padding(1, 1, 1, 1),
    conv_stride(1, 1),
    dilation(1, 1),
    groups(1),
    reference_check(true),
    measure_performance(true),
    iterations(20),
    alpha(1),
    beta(0),
    optimized(false) { }

  // Verify the problem size is compatible with the HYTLASS Convolution implementation.
  bool valid() {
    //
    // HYTLASS attempts to load 128b vectors of hytlass::half_t (F16) elements. Consequently,
    // all pointers, strides, and tensor extents must be divisible by 8 elements.
    //
    int const kAlignment = 128 / hytlass::sizeof_bits<ElementInputA>::value;

    if ((input_size.c() % kAlignment) || (filter_size.n() % kAlignment)) {
      // misaligned tensors
      return false;
    }

    // Invalid padding
    if ((padding.h() != filter_size.h() / 2) || (padding.w() != filter_size.w() / 2)) {
      return false;
    }

    return true;
  }
  /// Updates input and filter sizes
  void update(hytlass::Tensor4DCoord input_size, hytlass::Tensor4DCoord filter_size) {
    this->input_size = input_size;
    this->filter_size = filter_size;

    padding.n() = filter_size.h() / 2;
    padding.h() = filter_size.h() / 2;
    padding.w() = filter_size.w() / 2;
    padding.c() = filter_size.w() / 2;
  }
  // Parses the command line
  void parse(int argc, char const **args) {
    hytlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
    }
    if (cmd.check_cmd_line_flag("ref-check")) {
      reference_check = true;
    }
    if (cmd.check_cmd_line_flag("perf-check")) {
      measure_performance = true;
    }
    if (cmd.check_cmd_line_flag("optimized")) {
      optimized = true;
    }

    cmd.get_cmd_line_argument("n", input_size.n());
    cmd.get_cmd_line_argument("h", input_size.h());
    cmd.get_cmd_line_argument("w", input_size.w());
    cmd.get_cmd_line_argument("c", input_size.c());
    cmd.get_cmd_line_argument("k", filter_size.n());
    cmd.get_cmd_line_argument("r", filter_size.h());
    cmd.get_cmd_line_argument("s", filter_size.w());
    cmd.get_cmd_line_argument("g", groups);

    filter_size.c() = input_size.c() / groups;

    cmd.get_cmd_line_argument("u", conv_stride.row());
    cmd.get_cmd_line_argument("v", conv_stride.column());
    cmd.get_cmd_line_argument("alpha", alpha);
    cmd.get_cmd_line_argument("beta", beta);
    cmd.get_cmd_line_argument("iterations", iterations);
    cmd.get_cmd_line_argument("tag", tag);

    if (filter_size.h() == 3 && filter_size.w() == 3) {
      padding = {1, 1, 1, 1};
    }
    else {
      filter_size.h() = 1;
      filter_size.w() = 1;
      padding = {0, 0, 0, 0};
    }
  }
  /// Prints the usage statement.
  std::ostream & print_usage(std::ostream &out) const {

    out << "11_hytlass_tensorop_group_conv example\n\n"
      << "  This example uses Gfx928 Tensor Core operators on F16 data types to compute\n"
      << "  forward grouped convolution on tensors of layout NHWC.\n\n"
      << "Options:\n\n"
      << "  --help               If specified, displays this usage statement.\n\n"
      << "  --n=<int>            Input tensor extent N\n"
      << "  --h=<int>            Input tensor extent H\n"
      << "  --w=<int>            Input tensor extent W\n"
      << "  --c=<int>            Input tensor extent C\n"
      << "  --k=<int>            Filter extent K\n"
      << "  --r=<int>            Filter extent R\n"
      << "  --s=<int>            Filter extent S\n\n"
      << "  --g=<int>            Conv groups G\n\n"
      << "  --u=<int>            Conv stride_h\n\n"
      << "  --v=<int>            Conv stride_w\n\n"
      << "  --alpha=<float>      Epilogue scalar alpha\n"
      << "  --beta=<float>       Epilogue scalar beta\n\n"
      << "  --ref-check          If set (true), reference check is computed\n"
      << "  --perf-check         If set (true), performance is measured.\n"
      << "  --optimized          If set (true), use optimized kernel, otherwise use analytic kernel.\n"
      << "  --iterations=<int>   Number of profiling iterations to perform.\n"
      << "  --tag=<string>       String to replicate across the first column in the results table\n";

    out << "\n\nExamples:\n\n"
      << "$ ./examples/11_hytlass_tensorop_group_conv/gfx928_tensorop_group_conv2dfprop --n=4 --h=16 --w=16 --c=256 --k=128 --r=3 --s=3 --g=8 --ref-check\n\n"
      << "$ ./examples/11_hytlass_tensorop_group_conv/gfx928_tensorop_group_conv2dfprop --n=4 --h=16 --w=16 --c=256 --k=128 --r=3 --s=3 --g=2 --ref-check\n\n"
      << "$ ./examples/11_hytlass_tensorop_group_conv/gfx928_tensorop_group_conv2dfprop --n=4 --h=16 --w=16 --c=256 --k=128 --r=3 --s=3 --g=2 --ref-check --optimized\n\n";

    return out;
  }
  /// Computes the output tensor size (NPQK)
  hytlass::Tensor4DCoord output_size() const {
    return hytlass::Tensor4DCoord(
        input_size.n(),
        (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1,
        (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1,
        filter_size.n());
  }
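  // Worked example of the extent formula above (values assumed for illustration): with
  // H = 16, padding 1 + 1, R = 3 and unit stride, P = (16 + 1 + 1 - 3) / 1 + 1 = 16,
  // i.e. a 3x3 filter with "same" padding and stride 1 preserves the spatial extent.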
  /// Compute performance in GFLOP/s
  double gflops(double runtime_s) const {
    // Number of multiply-adds = NPQK * CRS
    int64_t fmas = output_size().product() *
                   int64_t(filter_size.h() * filter_size.w() * filter_size.c());

    // Two flops per multiply-add
    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
  }
};
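// Worked example for Options::gflops() above (values assumed for illustration): an output
// tensor of extent N=4, P=16, Q=16, K=128 with a 3x3 filter over C/groups = 32 channels
// gives fmas = (4 * 16 * 16 * 128) * (3 * 3 * 32) = 37,748,736 multiply-adds, about
// 0.075 GFLOP, so a 1 ms runtime would correspond to roughly 75 GFLOP/s.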
/////////////////////////////////////////////////////////////////////////////////////////////////

struct Result {
  double runtime_ms;
  double gflops;
  hytlass::Status status;
  hytlass::Status reference_check;
  hipError_t error;

  Result():
    runtime_ms(0),
    gflops(0),
    status(hytlass::Status::kSuccess),
    reference_check(hytlass::Status::kInvalid),
    error(hipSuccess) { }

  static std::ostream & print_header(std::ostream &out, Options const &options) {
    if (!options.tag.empty()) {
      out << "Name,";
    }
    out << "Layer,N,H,W,C,K,R,S,G,Runtime,GFLOPs";
    return out;
  }

  std::ostream & print(std::ostream &out, int idx, Options const &options) {
    if (!options.tag.empty()) {
      out << options.tag << ",";
    }
    out << "conv_" << idx << ","
        << options.input_size.n() << ","
        << options.input_size.h() << ","
        << options.input_size.w() << ","
        << options.input_size.c() << ","
        << options.filter_size.n() << ","
        << options.filter_size.h() << ","
        << options.filter_size.w() << ","
        << options.groups << ","
        << runtime_ms << ","
        << gflops;
    return out;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Runs one benchmark
template <typename Conv2dOperation>
Result profile_convolution(Options const &options) {

  Result result;

  //
  // Allocate host-device tensors using the HYTLASS Utilities.
  //
  hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size);
  hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size);
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(options.output_size());
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(options.output_size());
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(options.output_size());

  //
  // Initialize tensors
  //

  // Fill tensor A on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_a.host_view(), 1, ElementInputA(7), ElementInputA(-8), 0);

  // Fill tensor B on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_b.host_view(), 1, ElementInputB(7), ElementInputB(-8), 0);

  // Fill tensor C on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_c.host_view(), 1, ElementOutput(7), ElementOutput(-8), 0);

  // Fill tensor D on host with zeros
  hytlass::reference::host::TensorFill(tensor_d.host_view());

  // Fill tensor D for reference on host with zeros
  hytlass::reference::host::TensorFill(tensor_ref_d.host_view());

  // Copy data from host to GPU
  tensor_a.sync_device();
  tensor_b.sync_device();
  tensor_c.sync_device();
  tensor_d.sync_device();
  tensor_ref_d.sync_device();
  //
  // Define arguments for HYTLASS Convolution
  //
  hytlass::conv::Mode mode = hytlass::conv::Mode::kCrossCorrelation;

  // Split K dimension into 1 partition
  int split_k_slices = 1;

  // Construct Conv2dProblemSize with user defined output size
  hytlass::conv::Conv2dProblemSize problem_size(
      options.input_size,
      options.filter_size,
      options.padding,
      options.conv_stride,
      options.dilation,
      options.output_size(),
      mode,
      split_k_slices,
      options.groups);

  // Construct Conv2dOperation::Arguments structure with conv2d
  // problem size, data pointers, and epilogue values
  typename Conv2dOperation::Arguments arguments{
      problem_size,
      tensor_a.device_ref(),
      tensor_b.device_ref(),
      tensor_c.device_ref(),
      tensor_d.device_ref(),
      {options.alpha, options.beta},
  };
  //
  // Initialize HYTLASS Convolution
  //
  Conv2dOperation implicit_gemm_op;

  size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);

  // Allocate workspace memory
  hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  result.status = implicit_gemm_op.can_implement(arguments);
  HYTLASS_CHECK(result.status);

  result.status = implicit_gemm_op.initialize(arguments, workspace.get());
  HYTLASS_CHECK(result.status);

  //
  // Launch initialized HYTLASS kernel
  //
  result.status = implicit_gemm_op();
  HYTLASS_CHECK(result.status);
  //
  // Optional reference check
  //
  if (options.reference_check) {
    std::cout << "Verification on device...\n";

    // Compute with reference implementation
    hytlass::reference::device::Conv2dFprop<
        ElementInputA, LayoutInputA,
        ElementInputB, LayoutInputB,
        ElementOutput, LayoutOutput,
        ElementComputeEpilogue, ElementAccumulator,
        hytlass::NumericConverter<ElementOutput, ElementComputeEpilogue>>(
        problem_size,
        tensor_a.device_ref(),
        tensor_b.device_ref(),
        tensor_c.device_ref(),
        tensor_ref_d.device_ref(),
        options.alpha,
        options.beta);

    tensor_ref_d.sync_host();

    // Check if output from HYTLASS kernel and reference kernel are equal or not
    tensor_d.sync_host();

    bool passed = hytlass::reference::host::TensorEquals(
        tensor_d.host_view(), tensor_ref_d.host_view());

    if (!passed) {
      result.reference_check = hytlass::Status::kErrorInternal;
      std::cout << "ERROR - results miscompared.\n";
    }
    else {
      result.reference_check = hytlass::Status::kSuccess;
      std::cout << "Passed.\n";
    }
  }
  else {
    result.reference_check = hytlass::Status::kInvalid;
  }
  //
  // Performance measurement
  //
  if (options.measure_performance) {

    hipEvent_t events[2];

    for (auto &event : events) {
      result.error = hipEventCreate(&event);
      if (result.error != hipSuccess) {
        std::cerr << "hipEventCreate() failed: " << hipGetErrorString(result.error) << std::endl;
        return result;
      }
    }

    // Record an event at the start of a series of convolution operations.
    result.error = hipEventRecord(events[0]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Launch a sequence of implicit GEMM operations on the device
    for (int iteration = 0; iteration < options.iterations; ++iteration) {
      result.status = implicit_gemm_op();
      HYTLASS_CHECK(result.status);
    }

    // Record an event when the convolutions have been launched.
    result.error = hipEventRecord(events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Wait for work on the device to complete.
    result.error = hipEventSynchronize(events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventSynchronize() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Measure elapsed runtime
    float runtime_ms = 0;
    result.error = hipEventElapsedTime(&runtime_ms, events[0], events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventElapsedTime() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Compute average runtime and GFLOPs.
    result.runtime_ms = double(runtime_ms) / double(options.iterations);
    result.gflops = options.gflops(result.runtime_ms / 1000.0);

    // Cleanup
    for (auto event : events) {
      (void)hipEventDestroy(event);
    }
  }

  return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

Result profile_convolution_selecter(Options const &options) {
  int k_per_group = options.filter_size.n() / options.groups;

  // In group conv, if k_per_group < threadblock_N, one threadblock will calculate multiple groups
  if (k_per_group < ThreadblockShape::kN) {  // MultipleGroup mode
    if (options.optimized) {
      std::cerr << "Invalid problem: the optimized group conv kernel doesn't support MultipleGroup "
                   "(one CTA calculates multiple groups) mode" << std::endl;
      exit(-1);
    }
    else {
      std::cout << "Select AnalyticMultipleGroupOperation\n";
      return profile_convolution<AnalyticMultipleGroupOperation>(options);
    }
  }
  else {  // SingleGroup mode
    if (options.optimized) {
      std::cout << "Select OptimizedSingleGroupOperation\n";
      return profile_convolution<OptimizedSingleGroupOperation>(options);
    }
    else {
      std::cout << "Select AnalyticSingleGroupOperation\n";
      return profile_convolution<AnalyticSingleGroupOperation>(options);
    }
  }
}
/////////////////////////////////////////////////////////////////////////////////////////////////

int main(int argc, char const **args) {

  Options options;
  options.parse(argc, args);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  // Execute one problem size
  if (!options.valid()) {
    std::cerr << "Invalid problem." << std::endl;
    return -1;
  }

  Result result = profile_convolution_selecter(options);

  Result::print_header(std::cout, options) << std::endl;
  result.print(std::cout, 1, options) << std::endl;

  return 0;
}

/////////////////////////////////////////////////////////////////////////////////////////////////
examples/12_depthwise_simt_conv2dfprop/CMakeLists.txt (new file, mode 100644)
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
  depthwise_simt_conv2dfprop
  depthwise_simt_conv2dfprop.cu
)
examples/12_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu (new file, mode 100644)
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
This example shows how to run depthwise 2d convolution kernels using functions and data structures
provided by HYTLASS using SIMT instructions.
There are 3 types of implementations of depthwise 2d convolution:
1. kAnalytic
   Implicit gemm 2d convolution algorithm.
2. kOptimized
   An optimized algorithm that supports arbitrary stride and dilation.
3. kFixedStrideDilation
   An optimized algorithm with fixed stride and dilation, which reduces runtime computation and
   enables more optimizations.
In general, the performance of kFixedStrideDilation is better than kOptimized. However, if the
filter size, stride, or dilation is large, it may encounter register spilling, which can hurt
performance; in that case, please use kOptimized.
For kOptimized and kFixedStrideDilation, when the output tensor size is large, splitK should be
enabled in order to fully utilize GPU hardware resources and achieve better performance.
This example demonstrates how to construct and run a FixedStrideDilation depthwise 2d
convolution kernel.
*/
#include <iostream>
#include <fstream>
#include <sstream>
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm.h"
#include "hytlass/conv/kernel/default_depthwise_fprop.h"
#include "hytlass/conv/device/implicit_gemm_convolution.h"
#include "hytlass/conv/device/direct_convolution.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/tensor_view_io.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/reference/host/convolution.h"
#include "helper.h"
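// Depthwise convolution, for reference: every channel is convolved with its own single
// filter (groups == C and filter_size.c() == 1), so output channel g depends only on
// input channel g. A scalar sketch of one output element (illustrative only, assuming
// the default cross-correlation mode):
//   out(n, p, q, g) = sum over (r, s) of
//     in(n, p*stride_h - pad_h + r*dilation_h, q*stride_w - pad_w + s*dilation_w, g) * filter(g, r, s)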
// The code section below describes datatype for input, output tensors and computation between
// elements
using ElementAccumulator = hytlass::half_t;       // Data type of accumulator
using ElementComputeEpilogue = hytlass::half_t;   // Data type of epilogue computation (alpha, beta)
using ElementInputA = hytlass::half_t;            // Data type of elements in input tensor
using ElementInputB = hytlass::half_t;            // Data type of elements in input tensor
using ElementOutput = hytlass::half_t;            // Data type of elements in output tensor

using LayoutInputA = hytlass::layout::TensorNHWC;
using LayoutInputB = hytlass::layout::TensorNHWC;
using LayoutOutput = hytlass::layout::TensorNHWC;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = hytlass::arch::OpClassSimt;

// This code section describes hip Gfx architecture number
using SmArch = hytlass::arch::Gfx906;

// This code section describes the groups a thread block will compute
constexpr int groups_per_cta = 64;

// This code section describes the output tile <N, O, P, Q> a thread block will compute
using ThreadBlockOutputShape = hytlass::conv::TensorNHWCShape<1, 8, 8, groups_per_cta>;

// This code section describes the filter shape <R, S>
using FilterShape = hytlass::MatrixShape<3, 3>;

// Threadblock tile shape
using ThreadblockShape =
    hytlass::gemm::GemmShape<ThreadBlockOutputShape::kNHW, groups_per_cta, FilterShape::kCount>;

// This code section describes the tile size a warp will compute
// WarpShape::kM = P * Q the warps would process
// WarpShape::kN = groups_per_cta that the warps would process
// WarpShape::kK = filter_size that the warps would process
using WarpShape = hytlass::gemm::GemmShape<16, groups_per_cta, FilterShape::kCount>;

// This code section describes the size of MMA op
using InstructionShape = hytlass::gemm::GemmShape<1, 1, 1>;
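// Worked example of the shapes above: ThreadBlockOutputShape = <1, 8, 8, 64> gives
// kNHW = 1 * 8 * 8 = 64, and FilterShape = <3, 3> gives kCount = 9, so ThreadblockShape
// is GemmShape<64, 64, 9>: each threadblock computes 64 output pixels for 64 channels
// (groups), accumulating over the 9 filter taps.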
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = hytlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<
    1,
    ThreadBlockOutputShape::kN,
    ThreadBlockOutputShape::kH,
    ThreadBlockOutputShape::kW>;

// Number of pipelines you want to use
constexpr int NumStages = 2;

// This code section describes the iterator algorithm selected: kFixedStrideDilation
static hytlass::conv::IteratorAlgorithm const IteratorAlgorithm =
    hytlass::conv::IteratorAlgorithm::kFixedStrideDilation;

using StrideShape = hytlass::MatrixShape<1, 1>;
using DilationShape = hytlass::MatrixShape<1, 1>;

constexpr int kEpilogueElementsPerAccess = 128 / hytlass::sizeof_bits<ElementOutput>::value;

// This code section describes the epilogue part of the kernel, we use default value
using EpilogueOp = hytlass::epilogue::thread::LinearCombination<
    ElementOutput,               // Data type of output matrix.
    kEpilogueElementsPerAccess,  // The number of elements per vectorized memory access.
                                 // This becomes the vector width of math instructions in
                                 // the epilogue too.
    ElementAccumulator,          // Data type of accumulator
    ElementComputeEpilogue,      // Data type for alpha/beta in linear combination
    hytlass::epilogue::thread::ScaleType::OnlyAlphaScaling>;  // Epilogue scaling operation.
using DepthwiseDirect2dConv = typename hytlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop<
    ElementInputA, LayoutInputA,
    ElementInputB, LayoutInputB,
    ElementOutput, LayoutOutput,
    ElementAccumulator,
    MMAOp,
    SmArch,
    ThreadblockShape,
    ThreadBlockOutputShape,
    FilterShape,
    WarpShape,
    InstructionShape,
    EpilogueOp,
    SwizzleThreadBlock,
    NumStages,
    hytlass::arch::OpMultiplyAdd,
    IteratorAlgorithm,
    hytlass::conv::StrideSupport::kFixed,
    StrideShape,
    DilationShape>::Kernel;

using Direct2dConv = hytlass::conv::device::DirectConvolution<DepthwiseDirect2dConv>;
/////////////////////////////////////////////////////////////////////////////////////////////////

// Command line options parsing
struct Options {

  bool help;
  hytlass::Tensor4DCoord input_size;
  hytlass::Tensor4DCoord filter_size;
  hytlass::Tensor4DCoord padding;
  hytlass::MatrixCoord conv_stride;
  hytlass::MatrixCoord dilation;
  int groups;
  int splitk;
  bool reference_check;
  bool measure_performance;
  int iterations;
  bool save_workspace;
  ElementComputeEpilogue alpha;
  ElementComputeEpilogue beta;
  std::string tag;

  Options():
    help(false),
    input_size(1, 128, 128, 32),
    filter_size(32, 3, 3, 1),
    groups(32),
    padding(1, 1, 1, 1),
    conv_stride(1, 1),
    dilation(1, 1),
    reference_check(true),
    measure_performance(true),
    iterations(20),
    save_workspace(false),
    alpha(1),
    beta(0),
    splitk(1) {}
  // Verify the problem size is compatible with the HYTLASS Convolution implementation.
  bool valid() {
    //
    // HYTLASS attempts to load 128b vectors of hytlass::half_t (F16) elements. Consequently,
    // all pointers, strides, and tensor extents must be divisible by 8 elements.
    //
    int const kAlignment = 8;

    if ((input_size.c() % kAlignment) || (filter_size.n() % kAlignment)) {
      // misaligned tensors
      return false;
    }

    // depthwise conv
    if (groups != input_size.c()) {
      return false;
    }

    if (filter_size.n() != groups) {
      return false;
    }

    // Invalid padding
    if ((padding.h() != filter_size.h() / 2) || (padding.w() != filter_size.w() / 2)) {
      return false;
    }

    // Filter size passed through command line does not match filter size template parameter
    if (filter_size.h() != FilterShape::kRow || filter_size.w() != FilterShape::kColumn) {
      std::cerr << "Filter size passed in (" << filter_size.h() << "x" << filter_size.w() << ") "
                << "must match the FilterShape template parameter of the convolution "
                << "(" << FilterShape::kRow << "x" << FilterShape::kColumn << "). "
                << "To use the filter shape passed in, change the FilterShape template "
                << "parameter and recompile this example." << std::endl;
      return false;
    }

    return true;
  }
  /// Updates input and filter sizes
  void update(hytlass::Tensor4DCoord input_size, hytlass::Tensor4DCoord filter_size) {
    this->input_size = input_size;
    this->filter_size = filter_size;

    padding.n() = filter_size.h() / 2;
    padding.h() = filter_size.h() / 2;
    padding.w() = filter_size.w() / 2;
    padding.c() = filter_size.w() / 2;
  }
  // Parses the command line
  void parse(int argc, char const **args) {
    hytlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
    }
    if (cmd.check_cmd_line_flag("ref-check")) {
      reference_check = true;
    }
    if (cmd.check_cmd_line_flag("perf-check")) {
      measure_performance = true;
    }
    if (cmd.check_cmd_line_flag("save-workspace")) {
      save_workspace = true;
    }

    cmd.get_cmd_line_argument("n", input_size.n());
    cmd.get_cmd_line_argument("h", input_size.h());
    cmd.get_cmd_line_argument("w", input_size.w());
    cmd.get_cmd_line_argument("c", input_size.c());
    cmd.get_cmd_line_argument("k", filter_size.n());
    cmd.get_cmd_line_argument("r", filter_size.h());
    cmd.get_cmd_line_argument("s", filter_size.w());
    cmd.get_cmd_line_argument("g", groups);

    filter_size.c() = 1;
    filter_size.n() = input_size.c();

    cmd.get_cmd_line_argument("alpha", alpha);
    cmd.get_cmd_line_argument("beta", beta);
    cmd.get_cmd_line_argument("splitk", splitk);
    cmd.get_cmd_line_argument("iterations", iterations);
    cmd.get_cmd_line_argument("tag", tag);

    int32_t padding_h = filter_size.h() / 2;
    int32_t padding_w = filter_size.w() / 2;

    padding = {padding_h, padding_h, padding_w, padding_w};
  }
  /// Prints the usage statement.
  std::ostream & print_usage(std::ostream &out) const {

    out << "12_depthwise_simt_conv2dfprop example\n\n"
      << "  This example uses SIMT operators on F16 data types to compute\n"
      << "  forward depthwise convolution on tensors of layout NHWC.\n\n"
      << "Options:\n\n"
      << "  --help               If specified, displays this usage statement.\n\n"
      << "  --n=<int>            Input tensor extent N\n"
      << "  --h=<int>            Input tensor extent H\n"
      << "  --w=<int>            Input tensor extent W\n"
      << "  --c=<int>            Input tensor extent C\n"
      << "  --k=<int>            Filter extent K\n"
      << "  --r=<int>            Filter extent R\n"
      << "  --s=<int>            Filter extent S\n\n"
      << "  --g=<int>            Groups\n\n"
      << "  --alpha=<float>      Epilogue scalar alpha\n"
      << "  --beta=<float>       Epilogue scalar beta\n\n"
      << "  --splitk=<int>       Enable splitK\n\n"
      << "  --ref-check          If set (true), reference check on the host is computed\n"
      << "  --perf-check         If set (true), performance is measured.\n"
      << "  --iterations=<int>   Number of profiling iterations to perform.\n"
      << "  --save-workspace     If set, workspace is written to a text file.\n"
      << "  --tag=<string>       String to replicate across the first column in the results "
         "table\n";

    out << "\n\nExamples:\n\n"
      << "$ ./examples/12_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop --n=32 "
         "--h=224 --w=224 --c=128 --k=128 --g=128 --r=3 --s=3\n\n"
      << "$ ./examples/12_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop --n=1 "
         "--h=224 --w=224 --c=32 --k=32 --g=32 --r=3 --s=3 --splitk=10 --ref-check\n\n";

    return out;
  }
  /// Computes the output tensor size (NPQK)
  hytlass::Tensor4DCoord output_size() const {
    return hytlass::Tensor4DCoord(
        input_size.n(),
        (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1,
        (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1,
        filter_size.n());
  }
  /// Compute performance in GFLOP/s
  double gflops(double runtime_s) const {
    // Number of multiply-adds = NPQK * CRS
    int64_t fmas = output_size().product() *
                   int64_t(filter_size.h() * filter_size.w() * filter_size.c());

    // Two flops per multiply-add
    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////

struct Result {
  double runtime_ms;
  double gflops;
  hytlass::Status status;
  hytlass::Status reference_check;
  hipError_t error;

  Result():
    runtime_ms(0),
    gflops(0),
    status(hytlass::Status::kSuccess),
    reference_check(hytlass::Status::kInvalid),
    error(hipSuccess) {}

  static std::ostream & print_header(std::ostream &out, Options const &options) {
    if (!options.tag.empty()) {
      out << "Name,";
    }
    out << "Layer,N,H,W,C,K,R,S,G,stride_h,stride_w,dilation_h,dilation_w,splitK,Runtime,GFLOPs";
    return out;
  }

  std::ostream & print(std::ostream &out, int idx, Options const &options) {
    if (!options.tag.empty()) {
      out << options.tag << ",";
    }
    hytlass::Tensor4DCoord output_size = options.output_size();
    out << "conv_" << idx << ","
        << options.input_size.n() << ","
        << options.input_size.h() << ","
        << options.input_size.w() << ","
        << options.input_size.c() << ","
        << options.filter_size.n() << ","
        << options.filter_size.h() << ","
        << options.filter_size.w() << ","
        << options.groups << ","
        << options.conv_stride.row() << ","
        << options.conv_stride.column() << ","
        << options.dilation.row() << ","
        << options.dilation.column() << ","
        << options.splitk << ","
        << runtime_ms << ","
        << gflops;
    return out;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Runs one testcase
Result profile_convolution(Options const &options) {

  Result result;

  //
  // Allocate host-device tensors using the HYTLASS Utilities.
  //
  hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size);
  hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size);
  hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b_transpose(options.filter_size);
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(options.output_size());
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(options.output_size());
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(options.output_size());

  //
  // Initialize tensors
  //

  // Fill tensor A on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_a.host_view(), 1, ElementInputA(5), ElementInputA(-6), 0);

  // Fill tensor B on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_b.host_view(), 1, ElementInputB(3), ElementInputB(-6), 0);

  // Fill tensor C on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_c.host_view(), 1, ElementOutput(5), ElementOutput(-6), 0);

  // Fill tensor D on host with zeros
  hytlass::reference::host::TensorFill(tensor_d.host_view());

  // Fill tensor D for reference on host with zeros
  hytlass::reference::host::TensorFill(tensor_ref_d.host_view());

  // Copy data from host to GPU
  tensor_a.sync_device();
  tensor_b.sync_device();
  tensor_b_transpose.sync_device();
  tensor_c.sync_device();
  tensor_d.sync_device();
  tensor_ref_d.sync_device();
  //
  // Define arguments for HYTLASS Convolution
  //
  hytlass::conv::Mode mode = hytlass::conv::Mode::kCrossCorrelation;

  // Split P*Q into multiple CTAs
  int split_k_slices = options.splitk;

  // Construct Conv2dProblemSize with user defined output size
  hytlass::conv::Conv2dProblemSize problem_size(
      options.input_size,
      options.filter_size,
      options.padding,
      options.conv_stride,
      options.dilation,
      options.output_size(),
      mode,
      split_k_slices,
      options.groups);

  // Construct Direct2dConv::Arguments structure with conv2d
  // problem size, data pointers, and epilogue values
  typename Direct2dConv::Arguments arguments{
      problem_size,
      tensor_a.device_ref(),
      tensor_b.device_ref(),
      tensor_c.device_ref(),
      tensor_d.device_ref(),
      {options.alpha, options.beta},
      tensor_b_transpose.device_ref()};
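  // Note (an observation from the argument list above): unlike the implicit-GEMM example,
  // Direct2dConv::Arguments carries one extra tensor reference at the end; tensor_b_transpose
  // provides the device-side buffer the direct-convolution kernel uses for a reordered copy
  // of the filter.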
  //
  // Initialize HYTLASS Convolution
  //
  Direct2dConv implicit_gemm_op;

  size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);

  // Allocate workspace memory
  hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  result.status = implicit_gemm_op.can_implement(arguments);
  HYTLASS_CHECK(result.status);

  result.status = implicit_gemm_op.initialize(arguments, workspace.get());
  HYTLASS_CHECK(result.status);

  //
  // Launch initialized HYTLASS kernel
  //
  result.status = implicit_gemm_op();
  HYTLASS_CHECK(result.status);
  //
  // Optional reference check
  //
  if (options.reference_check) {
    std::cout << "Verification on host...\n";

    // Compute with reference implementation
    hytlass::reference::host::Conv2dFprop<
        ElementInputA, LayoutInputA,
        ElementInputB, LayoutInputB,
        ElementOutput, LayoutOutput,
        ElementComputeEpilogue, ElementAccumulator>(
        problem_size,
        tensor_a.host_ref(),
        tensor_b.host_ref(),
        tensor_c.host_ref(),
        tensor_ref_d.host_ref(),
        options.alpha,
        options.beta);

    // Check if output from HYTLASS kernel and reference kernel are equal or not
    tensor_d.sync_host();

    bool passed = hytlass::reference::host::TensorEquals(
        tensor_d.host_view(), tensor_ref_d.host_view());

    if (!passed) {
      result.reference_check = hytlass::Status::kErrorInternal;
      std::cout << "ERROR - results miscompared.\n";
    }
    else {
      result.reference_check = hytlass::Status::kSuccess;
      std::cout << "Passed.\n";
    }
  }
  else {
    result.reference_check = hytlass::Status::kInvalid;
  }
  if (options.save_workspace) {

    std::stringstream ss;

    ss << "12_depthwise_simt_conv2dfprop_"
       << options.input_size.n() << "x" << options.input_size.h() << "x"
       << options.input_size.w() << "x" << options.input_size.c()
       << "_"
       << options.filter_size.n() << "x" << options.filter_size.h() << "x"
       << options.filter_size.w() << "x" << options.filter_size.c()
       << ".dat";

    std::ofstream output_workspace(ss.str());

    output_workspace
        << "Input = \n" << tensor_a.host_view() << "\n\n"
        << "Filters = \n" << tensor_b.host_view() << "\n\n";

    if (options.reference_check) {
      output_workspace << "Reference = \n" << tensor_ref_d.host_view() << "\n\n";
    }

    output_workspace << "Computed = \n" << tensor_d.host_view() << std::endl;

    std::cout << "Results written to '" << ss.str() << "'." << std::endl;
  }
  //
  // Performance measurement
  //
  if (options.measure_performance) {

    hipEvent_t events[2];

    for (auto &event : events) {
      result.error = hipEventCreate(&event);
      if (result.error != hipSuccess) {
        std::cerr << "hipEventCreate() failed: " << hipGetErrorString(result.error) << std::endl;
        return result;
      }
    }

    // Record an event at the start of a series of convolution operations.
    result.error = hipEventRecord(events[0]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Launch a sequence of implicit GEMM operations on the device
    for (int iteration = 0; iteration < options.iterations; ++iteration) {
      result.status = implicit_gemm_op();
      HYTLASS_CHECK(result.status);
    }

    // Record an event when the convolutions have been launched.
    result.error = hipEventRecord(events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Wait for work on the device to complete.
    result.error = hipEventSynchronize(events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventSynchronize() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Measure elapsed runtime
    float runtime_ms = 0;
    result.error = hipEventElapsedTime(&runtime_ms, events[0], events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventElapsedTime() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Compute average runtime and GFLOPs.
    result.runtime_ms = double(runtime_ms) / double(options.iterations);
    result.gflops = options.gflops(result.runtime_ms / 1000.0);

    // Cleanup
    for (auto event : events) {
      (void)hipEventDestroy(event);
    }
  }

  return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

int main(int argc, char const **args) {

  Options options;
  options.parse(argc, args);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  // Execute one problem size
  if (!options.valid()) {
    std::cerr << "Invalid problem." << std::endl;
    return -1;
  }

  Result result = profile_convolution(options);

  Result::print_header(std::cout, options) << std::endl;
  result.print(std::cout, 1, options) << std::endl;

  return 0;
}

/////////////////////////////////////////////////////////////////////////////////////////////////
examples/13_hytlass_tensorop_fused_conv2d_fprop/CMakeLists.txt (new file, mode 100644)
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
  gfx928_tensorop_conv2d_bias_relu
  gfx928_tensorop_conv2d_bias_relu.cu
)

hytlass_example_add_executable(
  gfx928_tensorop_conv2d_bias_add_relu
  gfx928_tensorop_conv2d_bias_add_relu.cu
)
examples/13_hytlass_tensorop_fused_conv2d_fprop/gfx928_tensorop_conv2d_bias_add_relu.cu (new file, mode 100644)
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
This example shows how to run convolution kernels using functions and data structures
provided by HYTLASS using tensor cores.

Writing a single high performance convolution kernel is hard but do-able. Whereas writing
high performance kernels at scale which work for multiple problem sizes with good abstractions is
really hard. HYTLASS solves this problem by providing simplified abstractions to compose
multiple sections of an implicit gemm kernel. When used properly, the kernels can easily hit peak
performance of the GPU.

HYTLASS divides a kernel into hierarchical composable sections. Which means, at each thread, warp
and thread-block level, they compute on their own tile-size with higher level tile sizes being
composed from lower level ones. Multiple thread-tiles (tile size each thread computes) can be used
to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute
a threadblock-tile (tile size computed by a threadblock).

In this example, we split variable initialization into
1. Setting up data properties: describes how tensors are laid out in the memory and how the kernel
can view them (logical to physical mapping)
2. Setting up computation properties: describes how the above tensors will be used to compute
the output of the convolution.

First, we set up the data types of the input tensor A, weights' tensor B and output tensor C along
with alpha, beta, as the equation for convolution is C = alpha * Conv(A, B) + beta * C. In HYTLASS,
the kernels first compute Conv(A, B) and leave the rest of the computation to the end of the kernel,
as alpha * X + beta * C is a simple element-wise operation on X (Conv(A, B)) and C. We call this the
epilogue of the kernel. Hence, we set up the data types for alpha and beta to be equal to
ElementComputeEpilogue = float. This example uses the tensor core MMA instructions on half-precision
inputs, so the elements of input tensors A and B use the data type hytlass::half_t. We convey this
to the HYTLASS kernel by initializing the template variables ElementAccumulator (float),
ElementComputeEpilogue (float), ElementInputA (hytlass::half_t), ElementInputB (hytlass::half_t),
ElementOutput (float). Communicating just the data type is not enough. As the data is laid out
linearly in memory, we also have to convey the layout of the tensors. We do that by initializing the
template variables LayoutInputA, LayoutInputB and LayoutOutput to hytlass::layout::TensorNHWC. Next,
we set up the rules to compute alpha * X + beta * C, which is called the epilogue of the kernel. We
initialize the template variable EpilogueOp, which takes the data type of the output ElementOutput
(float), the number of elements per vector memory access (128 / sizeof_bits<ElementOutput>, i.e. 4
for float), the data type of the accumulator (float) and the data type of the linear combination
computation (alpha * X + beta * C).

Now that we have set up the properties of the data, we have to set up the properties of the
computation.

Second, we create template variables of tile sizes for thread-block, warp and mma-op of 64x64x128,
32x32x128, 16x16x32 (MxNxK) respectively. When these are passed to instantiate the HYTLASS Implicit
GEMM kernel, it internally deduces the number of threads needed per thread-block, the amount of
shared memory, how to store data in a bank-conflict free manner, and a ton of other variables
required to compose, initialize and launch a high performance Implicit GEMM kernel. This is the
beauty of HYTLASS: it relieves the developer from understanding and coding complicated hardware
optimizations which can easily go wrong.

HYTLASS also supports multiple MMA pipelines in a threadblock. What are MMA pipelines? MMA pipelines
constitute the whole process of loading input data from global memory to shared memory, loading data
from shared memory to registers, doing matrix multiplication, and storing to global memory. The flow
sequence below shows a typical mma pipeline.

tensor in global memory -> registers -> tile in shared memory -> registers -> mma -> registers ->
output to global memory

The problem with a single pipeline is that each stage is synchronous, which means each stage has to
wait until the previous one has finished executing. There are stages in the pipeline which do not
have fixed latency, for example, the loads from global memory and shared memory. Therefore, we can
add one more pipeline with a phase shift in the mma kernel to hide the latency of global and shared
memory loads. Finally, the pipelines in a kernel look like

(1) tensor in global memory -> (2) registers -> (3) tile in shared memory -> (4) registers -> (5)
mma -> (6) registers -> (7) output to global memory

(1) <null> -> (2) <null> -> (3) tensor in global memory -> (4) registers -> (5) tile in shared
memory -> (6) registers -> (7) mma -> (8) registers -> (9) output to global memory

This way, you can hide the latency of the second global memory load by doing computation on already
loaded input data.

There are a few more template variables to initialize, such as which threadblock tile of the output
matrix is computed by which threadblock launched on an SM, and the GFX architecture of the GPU you
want to run on.

These are all put together to create a template variable which describes the HYTLASS Implicit GEMM
kernel using the hytlass::conv::device::ImplicitGemm template.

The next step is to initialize physical data, instantiate and initialize the HYTLASS kernel and run
it. We use HYTLASS utilities to initialize, fill and compare tensors as they are simple and don't
get in the way of learning HYTLASS.

Once all the tensors are initialized and filled with data, we create an arguments tuple to launch the
HYTLASS kernel, which takes the problem size (N = 1, H = 14, W = 14, C = 1024), filter size (K = 256,
R = 1, S = 1, C = 1024), padding, strides, dilation, tensors, alpha, beta and, importantly, the
split k-dimension factor. Along with that, we query HYTLASS for any scratch-space memory required by
the kernel we instantiated. If it is needed, we create it and pass it along with the other arguments
to initialize the HYTLASS kernel; then the kernel is launched.

In this example, we later launch a reference convolution kernel (from the HYTLASS utilities) to
verify that the output from the HYTLASS kernel matches the reference implicit GEMM kernel.
*/
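// A concrete sketch of the fused epilogue this example configures below, read off the
// LinearCombinationResidualBlock functors (Identity activation, plus, ReLu). This is an
// interpretation for orientation, not part of the generated kernel itself. Each output
// element is computed as
//
//   D[n, p, q, k] = ReLu(alpha * Acc[n, p, q, k] + bias[k] + residual[n, p, q, k])
//
// which is what the host-side verification loop in profile_convolution() replays with
// alpha = 1.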
#include <iostream>
#include <fstream>
#include <sstream>
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm.h"
#include "hytlass/conv/kernel/default_conv2d_fprop_with_broadcast.h"
#include "hytlass/conv/device/implicit_gemm_convolution.h"
#include "hytlass/epilogue/thread/linear_combination_residual_block.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/tensor_view_io.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/reference/device/convolution.h"
#include "hytlass/util/tensor_view_io.h"
#include "helper.h"
// The code section below describes datatype for input, output tensors and computation between
// elements
using ElementAccumulator = float;       // Data type of accumulator
using ElementComputeEpilogue = float;   // Data type of epilogue computation (alpha, beta)
using ElementInputA = hytlass::half_t;  // Data type of elements in input tensor
using ElementInputB = hytlass::half_t;  // Data type of elements in input tensor
using ElementOutput = float;            // Data type of elements in output tensor

using LayoutInputA = hytlass::layout::TensorNHWC;
using LayoutInputB = hytlass::layout::TensorNHWC;
using LayoutOutput = hytlass::layout::TensorNHWC;

// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = hytlass::arch::OpClassTensorOp;

// This code section describes GFX architecture number
using SmArch = hytlass::arch::Gfx928;

// This code section describes the tile size a thread block will compute
using ThreadblockShape = hytlass::gemm::GemmShape<64, 64, 128>;  // Threadblock tile shape

// This code section describes tile size a warp will compute
using WarpShape = hytlass::gemm::GemmShape<32, 32, 128>;         // Warp tile shape

// This code section describes the size of MMA ops
using InstructionShape = hytlass::gemm::GemmShape<16, 16, 32>;   // TensorCore instruction shape

// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = hytlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>;

// 1 -> singlestage
// 2 -> pipelined
constexpr int NumStages = 1;

// This code section describes the epilogue part of the kernel, we use default value
using EpilogueOp = hytlass::epilogue::thread::LinearCombinationResidualBlock<
    ElementOutput,                                     // Data type of output tensor
    ElementAccumulator,                                // Data type of accumulator
    ElementComputeEpilogue,                            // Data type of epilogue computation
    ElementOutput,                                     // Data type of the residual/source operand
    128 / hytlass::sizeof_bits<ElementOutput>::value,  // Elements per vectorized memory access
    hytlass::epilogue::thread::Identity,
    hytlass::plus,
    hytlass::epilogue::thread::ReLu>;
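// With ElementOutput = float (32 bits), the vector width above works out to
// 128 / 32 = 4 elements per vectorized memory access.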
using Conv2dFpropKernel = typename hytlass::conv::kernel::DefaultConv2dFpropWithBroadcast<
    ElementInputA, LayoutInputA,
    ElementInputB, LayoutInputB,
    ElementOutput, LayoutOutput,
    ElementAccumulator,
    MMAOp,
    SmArch,
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    EpilogueOp,
    SwizzleThreadBlock,
    NumStages,
    hytlass::arch::OpMultiplyAdd,
    hytlass::conv::IteratorAlgorithm::kOptimized,
    hytlass::conv::StrideSupport::kStrided,
    8,
    8
>::Kernel;

using ImplicitGemm = hytlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
/////////////////////////////////////////////////////////////////////////////////////////////////

// Command line options parsing
struct Options {

  bool help;
  hytlass::Tensor4DCoord input_size;
  hytlass::Tensor4DCoord filter_size;
  hytlass::Tensor4DCoord padding;
  hytlass::MatrixCoord conv_stride;
  hytlass::MatrixCoord dilation;
  bool reference_check;
  bool measure_performance;
  int iterations;
  bool save_workspace;
  ElementComputeEpilogue alpha;
  ElementComputeEpilogue beta;
  bool benchmark;
  std::string tag;

  Options():
    help(false),
    input_size(1, 14, 14, 1024),
    filter_size(256, 1, 1, 1024),
    padding(0, 0, 0, 0),
    conv_stride(1, 1),
    dilation(1, 1),
    reference_check(true),
    measure_performance(true),
    iterations(20),
    save_workspace(false),
    alpha(1),
    beta(1),
    benchmark(false) { }

  // Verify the problem size is compatible with the HYTLASS Convolution implementation.
  bool valid() {
    //
    // HYTLASS attempts to load 128b vectors. Consequently,
    // all pointers, strides, and tensor extents must be divisible by 32 elements.
    //
    // int const kAlignment = 128 / hytlass::sizeof_bits<ElementInputA>::value;
    // if ((input_size.c() % kAlignment) ||
    //     (filter_size.n() % kAlignment)) {
    //   // misaligned tensors
    //   return false;
    // }

    // Invalid padding
    if ((padding.h() != filter_size.h() / 2) ||
        (padding.w() != filter_size.w() / 2)) {
      return false;
    }

    return true;
  }

  /// Updates input and filter sizes
  void update(hytlass::Tensor4DCoord input_size, hytlass::Tensor4DCoord filter_size) {
    this->input_size = input_size;
    this->filter_size = filter_size;

    padding.n() = filter_size.h() / 2;
    padding.h() = filter_size.h() / 2;
    padding.w() = filter_size.w() / 2;
    padding.c() = filter_size.w() / 2;
  }
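  // Note on update(): for example, a 3x3 filter (r = s = 3) yields padding
  // {1, 1, 1, 1}, i.e. "same" padding at unit stride, while the default 1x1 filter
  // yields zero padding on all sides.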
  // Parses the command line
  void parse(int argc, char const **args) {
    hytlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
    }
    if (cmd.check_cmd_line_flag("ref-check")) {
      reference_check = true;
    }
    if (cmd.check_cmd_line_flag("perf-check")) {
      measure_performance = true;
    }
    if (cmd.check_cmd_line_flag("save-workspace")) {
      save_workspace = true;
    }
    if (cmd.check_cmd_line_flag("benchmark")) {
      benchmark = true;
    }

    cmd.get_cmd_line_argument("n", input_size.n());
    cmd.get_cmd_line_argument("h", input_size.h());
    cmd.get_cmd_line_argument("w", input_size.w());
    cmd.get_cmd_line_argument("c", input_size.c());
    cmd.get_cmd_line_argument("k", filter_size.n());
    cmd.get_cmd_line_argument("r", filter_size.h());
    cmd.get_cmd_line_argument("s", filter_size.w());
    filter_size.c() = input_size.c();

    cmd.get_cmd_line_argument("alpha", alpha);
    cmd.get_cmd_line_argument("beta", beta);
    cmd.get_cmd_line_argument("iterations", iterations);
    cmd.get_cmd_line_argument("tag", tag);

    int32_t padding_h = filter_size.h() / 2;
    int32_t padding_w = filter_size.w() / 2;
    padding = {padding_h, padding_h, padding_w, padding_w};
  }
  /// Prints the usage statement.
  std::ostream &print_usage(std::ostream &out) const {

    out << "13_hytlass_tensorop_fused_conv2d_fprop example\n\n"
        << "  This example uses tensor core operators on half-precision data types to compute\n"
        << "  forward convolution on tensors of layout NHWC.\n\n"
        << "Options:\n\n"
        << "  --help               If specified, displays this usage statement.\n\n"
        << "  --n=<int>            Input tensor extent N\n"
        << "  --h=<int>            Input tensor extent H\n"
        << "  --w=<int>            Input tensor extent W\n"
        << "  --c=<int>            Input tensor extent C\n"
        << "  --k=<int>            Filter extent K\n"
        << "  --r=<int>            Filter extent R\n"
        << "  --s=<int>            Filter extent S\n\n"
        << "  --alpha=<float>      Epilogue scalar alpha\n"
        << "  --beta=<float>       Epilogue scalar beta\n\n"
        << "  --ref-check          If set (true), reference check on the host is computed\n"
        << "  --perf-check         If set (true), performance is measured.\n"
        << "  --benchmark          If set (true), performance benchmarking on several layers and batch-size.\n"
        << "  --iterations=<int>   Number of profiling iterations to perform.\n"
        << "  --save-workspace     If set, workspace is written to a text file.\n"
        << "  --tag=<string>       String to replicate across the first column in the results table\n";

    out << "\n\nExamples:\n\n"
        << "$ ./gfx928_tensorop_conv2d_bias_add_relu --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
        << "$ ./gfx928_tensorop_conv2d_bias_add_relu --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n";

    return out;
  }
  /// Computes the output tensor size (NPQK)
  hytlass::Tensor4DCoord output_size() const {
    return hytlass::Tensor4DCoord(
        input_size.n(),
        (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1,
        (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1,
        filter_size.n());
  }
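  // A worked example of output_size() with the default problem above:
  //   P = (H + pad_h0 + pad_h1 - R) / stride_h + 1 = (14 + 0 + 0 - 1) / 1 + 1 = 14
  //   Q = (W + pad_w0 + pad_w1 - S) / stride_w + 1 = (14 + 0 + 0 - 1) / 1 + 1 = 14
  // so the default output tensor is 1 x 14 x 14 x 256 (NPQK).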
  /// Compute performance in GFLOP/s
  double gflops(double runtime_s) const {
    // Number of multiply-adds = NPQK * CRS
    int64_t fmas = output_size().product() *
                   int64_t(filter_size.h() * filter_size.w() * filter_size.c());

    // Two flops per multiply-add
    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
  }
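  // For the default problem size, gflops() counts
  //   fmas = (1 * 14 * 14 * 256) * (1 * 1 * 1024) = 51,380,224
  // multiply-adds, i.e. roughly 0.103 GFLOP of work per convolution launch.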
};
/////////////////////////////////////////////////////////////////////////////////////////////////

struct Result {
  double runtime_ms;
  double gflops;
  hytlass::Status status;
  hytlass::Status reference_check;
  hipError_t error;

  Result():
    runtime_ms(0),
    gflops(0),
    status(hytlass::Status::kSuccess),
    reference_check(hytlass::Status::kInvalid),
    error(hipSuccess) { }

  static std::ostream &print_header(std::ostream &out, Options const &options) {
    if (!options.tag.empty()) {
      out << "Name,";
    }
    out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs";
    return out;
  }

  std::ostream &print(std::ostream &out, int idx, Options const &options) {
    if (!options.tag.empty()) {
      out << options.tag << ",";
    }
    out << "conv_" << idx << ","
        << options.input_size.n() << "," << options.input_size.h() << ","
        << options.input_size.w() << "," << options.input_size.c() << ","
        << options.filter_size.n() << "," << options.filter_size.h() << ","
        << options.filter_size.w() << ","
        << runtime_ms << "," << gflops;
    return out;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Runs one benchmark
Result profile_convolution(Options const &options) {

  Result result;

  //
  // Allocate host-device tensors using the HYTLASS Utilities.
  //
  hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size);
  hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size);
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c_bias({1, 1, 1, options.output_size().c()});
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(options.output_size());
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_res(options.output_size());
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(options.output_size());
  //
  // Initialize tensors
  //

  // Fill tensor A on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_a.host_view(), 1, ElementInputA(4), ElementInputA(-4), 4);

  // Fill tensor B on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_b.host_view(), 2, ElementInputB(5), ElementInputB(-5), 5);

  // Fill matrix C (the bias vector) on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_c_bias.host_view(), 1, ElementOutput(1), ElementOutput(-1), 0);

  // Fill tensor D on host with zeros
  hytlass::reference::host::TensorFill(tensor_d.host_view());

  // Fill the residual tensor on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_res.host_view(), 1, ElementOutput(1), ElementOutput(-1), 0);

  // Fill tensor D for reference on host with zeros
  hytlass::reference::host::TensorFill(tensor_ref_d.host_view());
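  // A note on the fills above (an interpretation of the HYTLASS utility, offered for
  // orientation only): TensorFillRandomUniform appears to take (view, seed, max, min)
  // plus a trailing integer that controls the precision of the generated random values.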
  // Copy data from host to GPU
  tensor_a.sync_device();
  tensor_b.sync_device();
  tensor_c_bias.sync_device();
  tensor_d.sync_device();
  tensor_res.sync_device();
  tensor_ref_d.sync_device();
  //
  // Define arguments for HYTLASS Convolution
  //

  // mode (kCrossCorrelation or kConvolution)
  hytlass::conv::Mode mode = hytlass::conv::Mode::kCrossCorrelation;

  // Split K dimension into 1 partition
  int split_k_slices = 1;

  // Note: alpha and beta are fixed here; the --alpha/--beta options parsed above are
  // not used by this example's kernel launch.
  float alpha = 1;
  float beta = 1;

  // Construct Conv2dProblemSize with user defined output size
  hytlass::conv::Conv2dProblemSize problem_size(
      options.input_size,
      options.filter_size,
      options.padding,
      options.conv_stride,
      options.dilation,
      options.output_size(),
      mode,
      split_k_slices);
  // Construct ImplicitGemm::Argument structure with conv2d
  // problem size, data pointers, and epilogue values
  typename ImplicitGemm::Arguments arguments{
      problem_size,
      tensor_a.device_ref(),
      tensor_b.device_ref(),
      tensor_res.device_ref(),
      tensor_d.device_ref(),
      {alpha, beta},
      hytlass::conv::SplitKMode::kNone,
      (hytlass::half_t *)(tensor_c_bias.device_data()),
      nullptr,
      0,
      options.output_size().c()};
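  // Note: the trailing entries extend the usual GEMM-style argument list for the
  // broadcast epilogue. Read in order, they appear to be the split-K mode, the device
  // pointer to the per-channel bias vector that the epilogue broadcasts, an optional
  // auxiliary tensor pointer (unused here), and two stride/leading-dimension values.
  // This is a reading of the constructor above, not a normative description.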
  //
  // Initialize HYTLASS Convolution
  //
  ImplicitGemm implicit_gemm_op;

  size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);

  // Allocate workspace memory
  hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  result.status = implicit_gemm_op.can_implement(arguments);
  HYTLASS_CHECK(result.status);

  result.status = implicit_gemm_op.initialize(arguments, workspace.get());
  HYTLASS_CHECK(result.status);

  //
  // Launch initialized HYTLASS kernel
  //
  result.status = implicit_gemm_op();
  HYTLASS_CHECK(result.status);
  //
  // Optional reference check
  //
  if (options.reference_check) {
    std::cout << "Verification on host...\n";

    // Compute with reference implementation
    hytlass::reference::device::Conv2dFprop<
        ElementInputA, LayoutInputA,
        ElementInputB, LayoutInputB,
        ElementOutput, LayoutOutput,
        ElementComputeEpilogue,
        ElementAccumulator,
        hytlass::NumericConverter<ElementOutput, ElementComputeEpilogue>>(
        problem_size,
        tensor_a.device_ref(),
        tensor_b.device_ref(),
        tensor_c_bias.device_ref(),
        tensor_ref_d.device_ref(),
        1,
        0);
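    // With alpha = 1 and beta = 0, the device reference computes the plain convolution
    // Conv(A, B) only; the fused bias + residual add + ReLU epilogue is replayed on the
    // host below before the comparison.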
    // Check if output from HYTLASS kernel and reference kernel are equal or not
    tensor_d.sync_host();
    tensor_ref_d.sync_host();

    // Compute bias + residual add + ReLU in host code
    for (int n = 0; n < problem_size.N; ++n) {
      for (int p = 0; p < problem_size.P; ++p) {
        for (int q = 0; q < problem_size.Q; ++q) {
          for (int k = 0; k < problem_size.K; ++k) {
            tensor_ref_d.at({n, p, q, k}) =
                std::max(ElementOutput(0),
                         ElementOutput(tensor_ref_d.at({n, p, q, k}) +
                                       tensor_c_bias.at({0, 0, 0, k}) +
                                       tensor_res.at({n, p, q, k})));
          }
        }
      }
    }
    const ElementOutput non_zero_floor(1e-6f);
    ElementOutput eps(1e-3f);
    bool passed = hytlass::reference::host::TensorRelativelyEquals(
        tensor_d.host_view(), tensor_ref_d.host_view(), eps, non_zero_floor);

    if (!passed) {
      result.reference_check = hytlass::Status::kErrorInternal;
      std::cout << "ERROR - results miscompared.\n";
    }
    else {
      result.reference_check = hytlass::Status::kSuccess;
      std::cout << "Passed.\n";
    }
  }
  else {
    result.reference_check = hytlass::Status::kInvalid;
  }
  if (options.save_workspace) {

    std::stringstream ss;

    ss << "09_tensor_conv_workspace_conv2dfprop_"
       << options.input_size.n() << "x" << options.input_size.h() << "x"
       << options.input_size.w() << "x" << options.input_size.c()
       << "_"
       << options.filter_size.n() << "x" << options.filter_size.h() << "x"
       << options.filter_size.w() << "x" << options.filter_size.c()
       << ".dat";

    std::ofstream output_workspace(ss.str());

    output_workspace
        << "Input = \n" << tensor_a.host_view() << "\n\n"
        << "Filters = \n" << tensor_b.host_view() << "\n\n";

    if (options.reference_check) {
      output_workspace << "Reference = \n" << tensor_ref_d.host_view() << "\n\n";
    }

    output_workspace << "Computed = \n" << tensor_d.host_view() << std::endl;

    std::cout << "Results written to '" << ss.str() << "'." << std::endl;
  }
  //
  // Performance measurement
  //
  if (options.measure_performance) {

    hipEvent_t events[2];

    for (auto &event : events) {
      result.error = hipEventCreate(&event);
      if (result.error != hipSuccess) {
        std::cerr << "hipEventCreate() failed: " << hipGetErrorString(result.error) << std::endl;
        return result;
      }
    }

    // Record an event at the start of a series of convolution operations.
    result.error = hipEventRecord(events[0]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Launch a sequence of implicit GEMM operations on the device
    for (int iteration = 0; iteration < options.iterations; ++iteration) {
      result.status = implicit_gemm_op();
      HYTLASS_CHECK(result.status);
    }

    // Record an event when the convolutions have been launched.
    result.error = hipEventRecord(events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Wait for work on the device to complete.
    result.error = hipEventSynchronize(events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventSynchronize() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Measure elapsed runtime
    float runtime_ms = 0;
    result.error = hipEventElapsedTime(&runtime_ms, events[0], events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventElapsedTime() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Compute average runtime and GFLOPs.
    result.runtime_ms = double(runtime_ms) / double(options.iterations);
    result.gflops = options.gflops(result.runtime_ms / 1000.0);

    // Cleanup
    for (auto event : events) {
      (void)hipEventDestroy(event);
    }
  }

  return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {

  Options options;

  options.parse(argc, args);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  if (options.benchmark) {
    // Benchmark several layers
    int batch_sizes[] = {1, 32, 64, 128, 256, 512};

    struct Benchmark {
      int h, w, c, k, r, s;
    } layers[] = {
      {56, 56,   64,  256, 1, 1},
      {56, 56,   64,   64, 1, 1},
      {56, 56,   64,   64, 3, 3},
      {56, 56,  256,   64, 1, 1},
      {56, 56,  256,  512, 1, 1},
      {56, 56,  256,  128, 1, 1},
      {28, 28,  128,  128, 3, 3},
      {28, 28,  128,  512, 1, 1},
      {28, 28,  512,  128, 1, 1},
      {28, 28,  512, 1024, 1, 1},
      {28, 28,  512,  256, 1, 1},
      {14, 14,  256,  256, 3, 3},
      {14, 14,  256, 1024, 1, 1},
      {14, 14, 1024,  256, 1, 1},
      {14, 14, 1024, 2048, 1, 1},
      {14, 14, 1024,  512, 1, 1},
      { 7,  7,  512,  512, 3, 3},
    };

    Result::print_header(std::cout, options) << std::endl;

    int idx = 1;

    for (auto const &layer : layers) {
      for (auto N : batch_sizes) {
        options.update({N, layer.h, layer.w, layer.c}, {layer.k, layer.r, layer.s, layer.c});

        Result result = profile_convolution(options);
        result.print(std::cout, idx, options) << std::endl;
      }
      ++idx;
    }
  }
  else {
    // Execute one problem size
    if (!options.valid()) {
      std::cerr << "Invalid problem." << std::endl;
      return -1;
    }

    Result result = profile_convolution(options);

    Result::print_header(std::cout, options) << std::endl;
    result.print(std::cout, 1, options) << std::endl;
  }

  return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
examples/13_hytlass_tensorop_fused_conv2d_fprop/gfx928_tensorop_conv2d_bias_relu.cu
0 → 100644
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
This example shows how to run convolution kernels using functions and data structures
provided by HYTLASS using tensor cores.

Writing a single high performance convolution kernel is hard but do-able. Whereas writing
high performance kernels at scale which work for multiple problem sizes with good abstractions is
really hard. HYTLASS solves this problem by providing simplified abstractions to compose
multiple sections of an implicit gemm kernel. When used properly, the kernels can easily hit peak
performance of the GPU.

HYTLASS divides a kernel into hierarchical composable sections. Which means, at each thread, warp
and thread-block level, they compute on their own tile-size with higher level tile sizes being
composed from lower level ones. Multiple thread-tiles (tile size each thread computes) can be used
to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute
a threadblock-tile (tile size computed by a threadblock).

In this example, we split variable initialization into
1. Setting up data properties: describes how tensors are laid out in the memory and how the kernel
can view them (logical to physical mapping)
2. Setting up computation properties: describes how the above tensors will be used to compute
the output of the convolution.

First, we set up the data types of the input tensor A, weights' tensor B and output tensor C along
with alpha, beta, as the equation for convolution is C = alpha * Conv(A, B) + beta * C. In HYTLASS,
the kernels first compute Conv(A, B) and leave the rest of the computation to the end of the kernel,
as alpha * X + beta * C is a simple element-wise operation on X (Conv(A, B)) and C. We call this the
epilogue of the kernel. Hence, we set up the data types for alpha and beta to be equal to
ElementComputeEpilogue = float. This example uses the tensor core MMA instructions on half-precision
inputs, so the elements of input tensors A and B use the data type hytlass::half_t. We convey this
to the HYTLASS kernel by initializing the template variables ElementAccumulator (float),
ElementComputeEpilogue (float), ElementInputA (hytlass::half_t), ElementInputB (hytlass::half_t),
ElementOutput (float). Communicating just the data type is not enough. As the data is laid out
linearly in memory, we also have to convey the layout of the tensors. We do that by initializing the
template variables LayoutInputA, LayoutInputB and LayoutOutput to hytlass::layout::TensorNHWC. Next,
we set up the rules to compute alpha * X + beta * C, which is called the epilogue of the kernel. We
initialize the template variable EpilogueOp, which takes the data type of the output ElementOutput
(float), the number of elements per vector memory access (128 / sizeof_bits<ElementOutput>, i.e. 4
for float), the data type of the accumulator (float) and the data type of the linear combination
computation (alpha * X + beta * C).

Now that we have set up the properties of the data, we have to set up the properties of the
computation.

Second, we create template variables of tile sizes for thread-block, warp and mma-op of 64x64x128,
32x32x128, 16x16x32 (MxNxK) respectively. When these are passed to instantiate the HYTLASS Implicit
GEMM kernel, it internally deduces the number of threads needed per thread-block, the amount of
shared memory, how to store data in a bank-conflict free manner, and a ton of other variables
required to compose, initialize and launch a high performance Implicit GEMM kernel. This is the
beauty of HYTLASS: it relieves the developer from understanding and coding complicated hardware
optimizations which can easily go wrong.

HYTLASS also supports multiple MMA pipelines in a threadblock. What are MMA pipelines? MMA pipelines
constitute the whole process of loading input data from global memory to shared memory, loading data
from shared memory to registers, doing matrix multiplication, and storing to global memory. The flow
sequence below shows a typical mma pipeline.

tensor in global memory -> registers -> tile in shared memory -> registers -> mma -> registers ->
output to global memory

The problem with a single pipeline is that each stage is synchronous, which means each stage has to
wait until the previous one has finished executing. There are stages in the pipeline which do not
have fixed latency, for example, the loads from global memory and shared memory. Therefore, we can
add one more pipeline with a phase shift in the mma kernel to hide the latency of global and shared
memory loads. Finally, the pipelines in a kernel look like

(1) tensor in global memory -> (2) registers -> (3) tile in shared memory -> (4) registers -> (5)
mma -> (6) registers -> (7) output to global memory

(1) <null> -> (2) <null> -> (3) tensor in global memory -> (4) registers -> (5) tile in shared
memory -> (6) registers -> (7) mma -> (8) registers -> (9) output to global memory

This way, you can hide the latency of the second global memory load by doing computation on already
loaded input data.

There are a few more template variables to initialize, such as which threadblock tile of the output
matrix is computed by which threadblock launched on an SM, and the GFX architecture of the GPU you
want to run on.

These are all put together to create a template variable which describes the HYTLASS Implicit GEMM
kernel using the hytlass::conv::device::ImplicitGemm template.

The next step is to initialize physical data, instantiate and initialize the HYTLASS kernel and run
it. We use HYTLASS utilities to initialize, fill and compare tensors as they are simple and don't
get in the way of learning HYTLASS.

Once all the tensors are initialized and filled with data, we create an arguments tuple to launch the
HYTLASS kernel, which takes the problem size (N = 1, H = 14, W = 14, C = 1024), filter size (K = 256,
R = 1, S = 1, C = 1024), padding, strides, dilation, tensors, alpha, beta and, importantly, the
split k-dimension factor. Along with that, we query HYTLASS for any scratch-space memory required by
the kernel we instantiated. If it is needed, we create it and pass it along with the other arguments
to initialize the HYTLASS kernel; then the kernel is launched.

In this example, we later launch a reference convolution kernel (from the HYTLASS utilities) to
verify that the output from the HYTLASS kernel matches the reference implicit GEMM kernel.
*/
#include <iostream>
#include <fstream>
#include <sstream>
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm.h"
#include "hytlass/conv/kernel/default_conv2d_fprop.h"
#include "hytlass/conv/device/implicit_gemm_convolution.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/tensor_view_io.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/reference/host/convolution.h"
#include "hytlass/util/reference/device/convolution.h"
#include "hytlass/util/tensor_view_io.h"
#include "helper.h"
// The code section below describes datatype for input, output tensors and computation between
// elements
using ElementAccumulator = float;       // Data type of accumulator
using ElementComputeEpilogue = float;   // Data type of epilogue computation (alpha, beta)
using ElementInputA = hytlass::half_t;  // Data type of elements in input tensor
using ElementInputB = hytlass::half_t;  // Data type of elements in input tensor
using ElementOutput = float;            // Data type of elements in output tensor

using LayoutInputA = hytlass::layout::TensorNHWC;
using LayoutInputB = hytlass::layout::TensorNHWC;
using LayoutOutput = hytlass::layout::TensorNHWC;

// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = hytlass::arch::OpClassTensorOp;

// This code section describes GFX architecture number
using SmArch = hytlass::arch::Gfx928;

// This code section describes the tile size a thread block will compute
using ThreadblockShape = hytlass::gemm::GemmShape<64, 64, 128>;  // Threadblock tile shape

// This code section describes tile size a warp will compute
using WarpShape = hytlass::gemm::GemmShape<32, 32, 128>;         // Warp tile shape

// This code section describes the size of MMA ops
using InstructionShape = hytlass::gemm::GemmShape<16, 16, 32>;   // TensorCore instruction shape

// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = hytlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>;

// 1 -> singlestage
// 2 -> pipelined
constexpr int NumStages = 1;
// This code section describes the epilogue part of the kernel, we use default value
using EpilogueOp = hytlass::epilogue::thread::LinearCombinationRelu<
    ElementOutput,                                     // Data type of output matrix.
    128 / hytlass::sizeof_bits<ElementOutput>::value,  // The number of elements per vectorized
                                                       // memory access. This becomes the vector width of
                                                       // math instructions in the epilogue too.
    ElementAccumulator,                                // Data type of accumulator
    ElementComputeEpilogue,                            // Data type for alpha/beta in linear combination
    hytlass::epilogue::thread::ScaleType::NoBetaScaling>;

// EpilogueOp::Params params(ElementComputeEpilogue(0.5), ElementComputeEpilogue(0));
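// A minimal sketch of the per-element math this epilogue performs, assuming the usual
// LinearCombinationRelu semantics (with NoBetaScaling, the source term is added
// unscaled by beta):
//
//   D[n, p, q, k] = ReLu(alpha * Acc[n, p, q, k] + C[n, p, q, k])
//
// In this example C is fed with a stride-0 bias vector, so C[n, p, q, k] == bias[k].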
using Conv2dFpropKernel = typename hytlass::conv::kernel::DefaultConv2dFprop<
    ElementInputA, LayoutInputA,
    ElementInputB, LayoutInputB,
    ElementOutput, LayoutOutput,
    ElementAccumulator,
    MMAOp,
    SmArch,
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    EpilogueOp,
    SwizzleThreadBlock,
    NumStages,
    hytlass::arch::OpMultiplyAdd,
    hytlass::conv::IteratorAlgorithm::kOptimized,
    hytlass::conv::StrideSupport::kStrided,
    8,
    8
>::Kernel;

using ImplicitGemm = hytlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
/////////////////////////////////////////////////////////////////////////////////////////////////

// Command line options parsing
struct Options {

  bool help;
  hytlass::Tensor4DCoord input_size;
  hytlass::Tensor4DCoord filter_size;
  hytlass::Tensor4DCoord padding;
  hytlass::MatrixCoord conv_stride;
  hytlass::MatrixCoord dilation;
  bool reference_check;
  bool measure_performance;
  int iterations;
  bool save_workspace;
  ElementComputeEpilogue alpha;
  ElementComputeEpilogue beta;
  bool benchmark;
  std::string tag;

  Options():
    help(false),
    input_size(1, 14, 14, 1024),
    filter_size(256, 1, 1, 1024),
    padding(0, 0, 0, 0),
    conv_stride(1, 1),
    dilation(1, 1),
    reference_check(true),
    measure_performance(true),
    iterations(20),
    save_workspace(false),
    alpha(1),
    beta(1),
    benchmark(false) { }

  // Verify the problem size is compatible with the HYTLASS Convolution implementation.
  bool valid() {
    //
    // HYTLASS attempts to load 128b vectors. Consequently,
    // all pointers, strides, and tensor extents must be divisible by 32 elements.
    //
    // int const kAlignment = 128 / hytlass::sizeof_bits<ElementInputA>::value;
    // if ((input_size.c() % kAlignment) ||
    //     (filter_size.n() % kAlignment)) {
    //   // misaligned tensors
    //   return false;
    // }

    // Invalid padding
    if ((padding.h() != filter_size.h() / 2) ||
        (padding.w() != filter_size.w() / 2)) {
      return false;
    }

    return true;
  }

  /// Updates input and filter sizes
  void update(hytlass::Tensor4DCoord input_size, hytlass::Tensor4DCoord filter_size) {
    this->input_size = input_size;
    this->filter_size = filter_size;

    padding.n() = filter_size.h() / 2;
    padding.h() = filter_size.h() / 2;
    padding.w() = filter_size.w() / 2;
    padding.c() = filter_size.w() / 2;
  }
  // Parses the command line
  void parse(int argc, char const **args) {
    hytlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
    }
    if (cmd.check_cmd_line_flag("ref-check")) {
      reference_check = true;
    }
    if (cmd.check_cmd_line_flag("perf-check")) {
      measure_performance = true;
    }
    if (cmd.check_cmd_line_flag("save-workspace")) {
      save_workspace = true;
    }
    if (cmd.check_cmd_line_flag("benchmark")) {
      benchmark = true;
    }

    cmd.get_cmd_line_argument("n", input_size.n());
    cmd.get_cmd_line_argument("h", input_size.h());
    cmd.get_cmd_line_argument("w", input_size.w());
    cmd.get_cmd_line_argument("c", input_size.c());
    cmd.get_cmd_line_argument("k", filter_size.n());
    cmd.get_cmd_line_argument("r", filter_size.h());
    cmd.get_cmd_line_argument("s", filter_size.w());
    filter_size.c() = input_size.c();

    cmd.get_cmd_line_argument("alpha", alpha);
    cmd.get_cmd_line_argument("beta", beta);
    cmd.get_cmd_line_argument("iterations", iterations);
    cmd.get_cmd_line_argument("tag", tag);

    int32_t padding_h = filter_size.h() / 2;
    int32_t padding_w = filter_size.w() / 2;
    padding = {padding_h, padding_h, padding_w, padding_w};
  }
  /// Prints the usage statement.
  std::ostream &print_usage(std::ostream &out) const {

    out << "13_tensorop_fused_conv2d_fprop example\n\n"
        << "  This example uses tensor core operators on half-precision data types to compute\n"
        << "  forward convolution on tensors of layout NHWC.\n\n"
        << "Options:\n\n"
        << "  --help               If specified, displays this usage statement.\n\n"
        << "  --n=<int>            Input tensor extent N\n"
        << "  --h=<int>            Input tensor extent H\n"
        << "  --w=<int>            Input tensor extent W\n"
        << "  --c=<int>            Input tensor extent C\n"
        << "  --k=<int>            Filter extent K\n"
        << "  --r=<int>            Filter extent R\n"
        << "  --s=<int>            Filter extent S\n\n"
        << "  --alpha=<float>      Epilogue scalar alpha\n"
        << "  --beta=<float>       Epilogue scalar beta\n\n"
        << "  --ref-check          If set (true), reference check on the host is computed\n"
        << "  --perf-check         If set (true), performance is measured.\n"
        << "  --benchmark          If set (true), performance benchmarking on several layers and batch-size.\n"
        << "  --iterations=<int>   Number of profiling iterations to perform.\n"
        << "  --save-workspace     If set, workspace is written to a text file.\n"
        << "  --tag=<string>       String to replicate across the first column in the results table\n";

    out << "\n\nExamples:\n\n"
        << "$ ./gfx928_tensorop_conv2d_bias_relu --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
        << "$ ./gfx928_tensorop_conv2d_bias_relu --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n";

    return out;
  }
  /// Computes the output tensor size (NPQK)
  hytlass::Tensor4DCoord output_size() const {
    return hytlass::Tensor4DCoord(
        input_size.n(),
        (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1,
        (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1,
        filter_size.n());
  }
  /// Compute performance in GFLOP/s
  double gflops(double runtime_s) const {
    // Number of multiply-adds = NPQK * CRS
    int64_t fmas = output_size().product() *
                   int64_t(filter_size.h() * filter_size.w() * filter_size.c());

    // Two flops per multiply-add
    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////

struct Result {
  double runtime_ms;
  double gflops;
  hytlass::Status status;
  hytlass::Status reference_check;
  hipError_t error;

  Result():
    runtime_ms(0),
    gflops(0),
    status(hytlass::Status::kSuccess),
    reference_check(hytlass::Status::kInvalid),
    error(hipSuccess) { }

  static std::ostream &print_header(std::ostream &out, Options const &options) {
    if (!options.tag.empty()) {
      out << "Name,";
    }
    out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs";
    return out;
  }

  std::ostream &print(std::ostream &out, int idx, Options const &options) {
    if (!options.tag.empty()) {
      out << options.tag << ",";
    }
    out << "conv_" << idx << ","
        << options.input_size.n() << "," << options.input_size.h() << ","
        << options.input_size.w() << "," << options.input_size.c() << ","
        << options.filter_size.n() << "," << options.filter_size.h() << ","
        << options.filter_size.w() << ","
        << runtime_ms << "," << gflops;
    return out;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Runs one benchmark
Result profile_convolution(Options const &options) {

  Result result;

  //
  // Allocate host-device tensors using the HYTLASS Utilities.
  //
  hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size);
  hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size);
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c_bias({1, 1, 1, options.output_size().c()});
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(options.output_size());
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(options.output_size());
  //
  // Initialize tensors
  //

  // Fill tensor A on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_a.host_view(), 1, ElementInputA(4), ElementInputA(-4),
      hytlass::MantissaInBits<ElementOutput>::bits);

  // Fill tensor B on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_b.host_view(), 2, ElementInputB(4), ElementInputB(-4),
      hytlass::MantissaInBits<ElementOutput>::bits);

  // Fill matrix C on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_c_bias.host_view(), 1, ElementOutput(1), ElementOutput(-1), 0);

  // Fill tensor D on host with zeros
  hytlass::reference::host::TensorFill(tensor_d.host_view());

  // Fill tensor D for reference on host with zeros
  hytlass::reference::host::TensorFill(tensor_ref_d.host_view());
  // Copy data from host to GPU
  tensor_a.sync_device();
  tensor_b.sync_device();
  tensor_c_bias.sync_device();
  tensor_d.sync_device();
  tensor_ref_d.sync_device();
  //
  // Define arguments for HYTLASS Convolution
  //

  // mode (kCrossCorrelation or kConvolution)
  hytlass::conv::Mode mode = hytlass::conv::Mode::kCrossCorrelation;

  // Split K dimension into 1 partition
  int split_k_slices = 1;

  ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
  ElementComputeEpilogue beta = ElementComputeEpilogue(0);

  // Construct Conv2dProblemSize with user defined output size
  hytlass::conv::Conv2dProblemSize problem_size(
      options.input_size,
      options.filter_size,
      options.padding,
      options.conv_stride,
      options.dilation,
      options.output_size(),
      mode,
      split_k_slices);
  // Construct ImplicitGemm::Argument structure with conv2d
  // problem size, data pointers, and epilogue values
  typename ImplicitGemm::Arguments arguments{
      problem_size,
      tensor_a.device_ref(),
      tensor_b.device_ref(),
      {tensor_c_bias.device_data(), LayoutOutput::Stride(0)},
      tensor_d.device_ref(),
      {alpha, beta},
  };
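  // Passing {bias pointer, LayoutOutput::Stride(0)} as the source operand makes every
  // output row read the same K-element vector: the zero stride turns tensor C into a
  // broadcast of the bias, so the bias add is fused into the GEMM epilogue without
  // materializing a full-size C tensor. The host verification loop below, which adds
  // tensor_c_bias.at({0, 0, 0, k}) to every output element, mirrors this.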
  //
  // Initialize HYTLASS Convolution
  //
  ImplicitGemm implicit_gemm_op;

  size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);

  // Allocate workspace memory
  hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  result.status = implicit_gemm_op.can_implement(arguments);
  HYTLASS_CHECK(result.status);

  result.status = implicit_gemm_op.initialize(arguments, workspace.get());
  HYTLASS_CHECK(result.status);

  //
  // Launch initialized HYTLASS kernel
  //
  result.status = implicit_gemm_op();
  HYTLASS_CHECK(result.status);
  //
  // Optional reference check
  //
  if (options.reference_check) {
    std::cout << "Verification on host...\n";

    // Compute with reference implementation
    hytlass::reference::device::Conv2dFprop<
        ElementInputA, LayoutInputA,
        ElementInputB, LayoutInputB,
        ElementOutput, LayoutOutput,
        ElementComputeEpilogue,
        ElementAccumulator,
        hytlass::NumericConverter<ElementOutput, ElementComputeEpilogue>>(
        problem_size,
        tensor_a.device_ref(),
        tensor_b.device_ref(),
        tensor_c_bias.device_ref(),
        tensor_ref_d.device_ref(),
        alpha,
        beta);
    // Check if output from HYTLASS kernel and reference kernel are equal or not
    tensor_d.sync_host();
    tensor_ref_d.sync_host();

    // Compute bias + relu in host code
    for (int n = 0; n < problem_size.N; ++n) {
      for (int p = 0; p < problem_size.P; ++p) {
        for (int q = 0; q < problem_size.Q; ++q) {
          for (int k = 0; k < problem_size.K; ++k) {
            tensor_ref_d.at({n, p, q, k}) =
                std::max(ElementOutput(0),
                         ElementOutput(tensor_ref_d.at({n, p, q, k}) +
                                       tensor_c_bias.at({0, 0, 0, k})));
          }
        }
      }
    }
    const ElementOutput non_zero_floor(1e-6);
    ElementOutput eps(1e-3);
    bool passed = hytlass::reference::host::TensorRelativelyEquals(
        tensor_d.host_view(), tensor_ref_d.host_view(), eps, non_zero_floor);

    if (!passed) {
      result.reference_check = hytlass::Status::kErrorInternal;
      std::cout << "ERROR - results miscompared.\n";
    }
    else {
      result.reference_check = hytlass::Status::kSuccess;
      std::cout << "Passed.\n";
    }
  }
  else {
    result.reference_check = hytlass::Status::kInvalid;
  }
  if (options.save_workspace) {

    std::stringstream ss;

    ss << "09_tensor_conv_workspace_conv2dfprop_"
       << options.input_size.n() << "x" << options.input_size.h() << "x"
       << options.input_size.w() << "x" << options.input_size.c()
       << "_"
       << options.filter_size.n() << "x" << options.filter_size.h() << "x"
       << options.filter_size.w() << "x" << options.filter_size.c()
       << ".dat";

    std::ofstream output_workspace(ss.str());

    output_workspace
        << "Input = \n" << tensor_a.host_view() << "\n\n"
        << "Filters = \n" << tensor_b.host_view() << "\n\n";

    if (options.reference_check) {
      output_workspace << "Reference = \n" << tensor_ref_d.host_view() << "\n\n";
    }

    output_workspace << "Computed = \n" << tensor_d.host_view() << std::endl;

    std::cout << "Results written to '" << ss.str() << "'." << std::endl;
  }
  //
  // Performance measurement
  //
  if (options.measure_performance) {

    hipEvent_t events[2];

    for (auto &event : events) {
      result.error = hipEventCreate(&event);
      if (result.error != hipSuccess) {
        std::cerr << "hipEventCreate() failed: " << hipGetErrorString(result.error) << std::endl;
        return result;
      }
    }

    // Record an event at the start of a series of convolution operations.
    result.error = hipEventRecord(events[0]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Launch a sequence of implicit GEMM operations on the device
    for (int iteration = 0; iteration < options.iterations; ++iteration) {
      result.status = implicit_gemm_op();
      HYTLASS_CHECK(result.status);
    }

    // Record an event when the convolutions have been launched.
    result.error = hipEventRecord(events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Wait for work on the device to complete.
    result.error = hipEventSynchronize(events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventSynchronize() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Measure elapsed runtime
    float runtime_ms = 0;
    result.error = hipEventElapsedTime(&runtime_ms, events[0], events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventElapsedTime() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Compute average runtime and GFLOPs.
    result.runtime_ms = double(runtime_ms) / double(options.iterations);
    result.gflops = options.gflops(result.runtime_ms / 1000.0);

    // Cleanup
    for (auto event : events) {
      (void)hipEventDestroy(event);
    }
  }

  return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {

  Options options;

  options.parse(argc, args);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  if (options.benchmark) {
    // Benchmark several layers
    int batch_sizes[] = {1, 32, 64, 128, 256, 512};

    struct Benchmark {
      int h, w, c, k, r, s;
    } layers[] = {
      {56, 56,   64,  256, 1, 1},
      {56, 56,   64,   64, 1, 1},
      {56, 56,   64,   64, 3, 3},
      {56, 56,  256,   64, 1, 1},
      {56, 56,  256,  512, 1, 1},
      {56, 56,  256,  128, 1, 1},
      {28, 28,  128,  128, 3, 3},
      {28, 28,  128,  512, 1, 1},
      {28, 28,  512,  128, 1, 1},
      {28, 28,  512, 1024, 1, 1},
      {28, 28,  512,  256, 1, 1},
      {14, 14,  256,  256, 3, 3},
      {14, 14,  256, 1024, 1, 1},
      {14, 14, 1024,  256, 1, 1},
      {14, 14, 1024, 2048, 1, 1},
      {14, 14, 1024,  512, 1, 1},
      { 7,  7,  512,  512, 3, 3},
    };

    Result::print_header(std::cout, options) << std::endl;

    int idx = 1;

    for (auto const &layer : layers) {
      for (auto N : batch_sizes) {
        options.update({N, layer.h, layer.w, layer.c}, {layer.k, layer.r, layer.s, layer.c});

        Result result = profile_convolution(options);
        result.print(std::cout, idx, options) << std::endl;
      }
      ++idx;
    }
  }
  else {
    // Execute one problem size
    if (!options.valid()) {
      std::cerr << "Invalid problem." << std::endl;
      return -1;
    }

    Result result = profile_convolution(options);

    Result::print_header(std::cout, options) << std::endl;
    result.print(std::cout, 1, options) << std::endl;
  }

  return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
examples/14_gather_scatter_fusion/CMakeLists.txt
0 → 100644
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
  gather_k_fusion
  gather_k_fusion.cu
)

hytlass_example_add_executable(
  gather_scatter_fusion
  gather_scatter_fusion.cu
)
\ No newline at end of file
examples/14_gather_scatter_fusion/gather_k_fusion.cu
0 → 100644
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
// This example fuses gather before GEMM and scatter after GEMM into the same
// GEMM kernel. The gather and scatter operations are controlled by an index vector
// used to select rows or columns from the A, B, C or D matrices.
//
// Suppose all matrices are column major. The pseudo code of the fused kernel
// in this example is essentially
//
// for (int i = 0; i < problem_size.m(); ++i) {
//   for (int j = 0; j < options.index_size; ++j) {
//     int b_c_d_col = tensor_indices.at({j, 0});
//
//     for (int k = 0; k < options.index_size; ++k) {
//       tensor_d_ref.at({i, b_c_d_col}) +=
//         alpha * tensor_a.at({i, k}) * tensor_b.at({k, b_c_d_col});
//     }
//   }
// }
//
// Note that the index vector contains unique random integers with maximum value N - 1.
//
// The gather/scatter operation works best when we can still keep the biggest
// alignment. For example, when the matrix is row major, we select rows. When
// the matrix is column major, we select columns.
//
// Not all combinations of gather and scatter are legal. For example, if A is
// row major and C/D is column major, we cannot gather A and scatter C/D at the
// same time.
//
// Also, for the sake of performance, we do not check that the index values are
// legal or that the index array pointer is valid.
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include <hip/hip_runtime.h>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <random>
#include <numeric>
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm_universal.h"
#include "hytlass/epilogue/thread/linear_combination.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/tensor_view_io.h"
#include "helper.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Result structure
struct Result {

  double runtime_ms;
  double gflops;
  hytlass::Status status;
  hipError_t error;
  bool passed;

  //
  // Methods
  //
  Result(
    double runtime_ms = 0,
    double gflops = 0,
    hytlass::Status status = hytlass::Status::kSuccess,
    hipError_t error = hipSuccess
  ):
    runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) {}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct
Options
{
bool
help
;
hytlass
::
gemm
::
GemmCoord
problem_size
;
int
index_size
;
bool
reference_check
;
int
iterations
;
Options
()
:
help
(
false
),
problem_size
({
1024
,
1024
,
1024
}),
index_size
(
240
),
reference_check
(
true
),
iterations
(
50
)
{}
bool
valid
()
{
return
true
;
}
// Parses the command line
void
parse
(
int
argc
,
char
const
**
args
)
{
hytlass
::
CommandLine
cmd
(
argc
,
args
);
if
(
cmd
.
check_cmd_line_flag
(
"help"
))
{
help
=
true
;
}
cmd
.
get_cmd_line_argument
(
"m"
,
problem_size
.
m
());
cmd
.
get_cmd_line_argument
(
"n"
,
problem_size
.
n
());
cmd
.
get_cmd_line_argument
(
"k"
,
problem_size
.
k
());
cmd
.
get_cmd_line_argument
(
"index_size"
,
index_size
);
cmd
.
get_cmd_line_argument
(
"iterations"
,
iterations
);
}
/// Prints the usage statement.
std
::
ostream
&
print_usage
(
std
::
ostream
&
out
)
const
{
out
<<
"14_gather_scatter_fusion example
\n\n
"
<<
" This example uses the HYTLASS Library to fuse gather/scatter into GEMM
\n\n
"
<<
"Options:
\n\n
"
<<
" --help If specified, displays this usage statement.
\n\n
"
<<
" --m=<int> GEMM M dimension
\n
"
<<
" --n=<int> GEMM N dimension
\n
"
<<
" --k=<int> GEMM K dimension
\n
"
<<
" --index_size=<int> size of N dimension index
\n
"
<<
" --iterations=<int> Number of profiling iterations to perform.
\n\n
"
;
out
<<
"
\n\n
Examples:
\n\n
"
<<
"$ ./examples/14_gather_scatter_fusion/gather_k_fusion --m=1024 --n=512 --k=1024
\\\n
"
<<
" --index_size=128
\n\n
"
;
return
out
;
}
/// Compute performance in GFLOP/s
double
gflops
(
double
runtime_s
)
const
{
// Number of real-valued multiply-adds
int64_t
fmas
=
problem_size
.
m
()
*
problem_size
.
n
()
*
int64_t
(
index_size
);
// Two flops per multiply-add
return
2.0
*
double
(
fmas
)
/
double
(
1.0e9
)
/
runtime_s
;
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using
ElementAccumulator
=
float
;
// <- data type of accumulator
using
ElementComputeEpilogue
=
ElementAccumulator
;
// <- data type of epilogue operations
using
ElementInputA
=
hytlass
::
half_t
;
// <- data type of elements in input matrix A
using
ElementInputB
=
hytlass
::
half_t
;
// <- data type of elements in input matrix B
using
ElementOutput
=
float
;
// <- data type of elements in output matrix D
// The code section below describes matrix layout of input and output matrices.
// Column Major for Matrix A, B and C.
//
using
LayoutInputA
=
hytlass
::
layout
::
ColumnMajor
;
using
LayoutInputB
=
hytlass
::
layout
::
RowMajor
;
using
LayoutOutput
=
hytlass
::
layout
::
RowMajor
;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using
MMAOp
=
hytlass
::
arch
::
OpClassTensorOp
;
// This code section describes GFX architecture number
using
SmArch
=
hytlass
::
arch
::
Gfx928
;
// This code section describes the tile size a thread block will compute
using
ShapeMMAThreadBlock
=
hytlass
::
gemm
::
GemmShape
<
128
,
256
,
32
>
;
// This code section describes tile size a warp will compute
using
ShapeMMAWarp
=
hytlass
::
gemm
::
GemmShape
<
64
,
128
,
32
>
;
// This code section describes the size of MMA op
using
ShapeMMAOp
=
hytlass
::
gemm
::
GemmShape
<
16
,
16
,
16
>
;
// This code section describes how threadblocks are scheduled on GPU
using
SwizzleThreadBlock
=
hytlass
::
gemm
::
threadblock
::
GemmIdentityThreadblockSwizzle
<>
;
// <- ??
// Define the epilogue operation as LinearCombination. This is approximately equal to
//
// d_ij = alpha * sum_k(a_ik * b_kj) + c_ij
//
using
EpilogueOp
=
hytlass
::
epilogue
::
thread
::
LinearCombination
<
ElementOutput
,
// <- data type of output matrix
128
/
hytlass
::
sizeof_bits
<
ElementOutput
>::
value
,
// <- this is the number of elements per
// vectorized memory access. For half
// precision, it's 8 elements. This becomes
// the vector width of math instructions in
// epilogue too
ElementAccumulator
,
// <- data type of accumulator
ElementComputeEpilogue
>
;
// <- data type for alpha in linear combination function
// Number of pipelines you want to use
constexpr
int
NumStages
=
1
;
using
Gemm
=
hytlass
::
gemm
::
device
::
GemmUniversal
<
ElementInputA
,
LayoutInputA
,
ElementInputB
,
LayoutInputB
,
ElementOutput
,
LayoutOutput
,
ElementAccumulator
,
MMAOp
,
SmArch
,
ShapeMMAThreadBlock
,
ShapeMMAWarp
,
ShapeMMAOp
,
EpilogueOp
,
SwizzleThreadBlock
,
NumStages
,
8
,
/*alignmentA*/
8
,
/*alignmentB*/
hytlass
::
arch
::
OpMultiplyAdd
,
hytlass
::
ComplexTransform
::
kNone
,
hytlass
::
ComplexTransform
::
kNone
,
true
,
/*GatherA*/
true
,
/*GatherB*/
false
,
/*ScatterD*/
hytlass
::
layout
::
NoPermute
,
hytlass
::
layout
::
NoPermute
,
hytlass
::
layout
::
NoPermute
,
true
/*buffer load*/
>
;
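// Note: with GatherA and GatherB enabled (and ScatterD disabled), the same index
// vector selects which K columns of A and K rows of B participate in the product,
// and the kernel iterates only over the gathered K extent: problem_size_real in
// run() below passes options.index_size as the K dimension. D is written densely.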
int run(Options &options) {

  // ================================================================================
  // Initialization setup

  // Create a tuple of problem size for matrix multiplication
  hytlass::gemm::GemmCoord problem_size = options.problem_size;

  // Create a tuple of problem size for matrix multiplication
  hytlass::gemm::GemmCoord problem_size_real(problem_size.m(), problem_size.n(), options.index_size);

  // Initialize tensors using HYTLASS helper functions
  hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(problem_size.mk()); // <- Create matrix A with dimensions M x K
  hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(problem_size.kn()); // <- Create matrix B with dimensions K x N
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(problem_size.mn()); // <- Create matrix C with dimensions M x N
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
                                                                                //    HYTLASS kernel

  // Fill input and output matrices on host using HYTLASS helper functions
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_a.host_view(), 1, ElementInputA(3), ElementInputA(-3), 0); // <- Fill matrix A on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_b.host_view(), 1, ElementInputA(3), ElementInputA(-3), 0); // <- Fill matrix B on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_c.host_view(), 1, ElementOutput(7), ElementOutput(-8), 0); // <- Fill matrix C on host with uniform-distribution random data
  hytlass::reference::host::TensorFill(tensor_d.host_view());           // <- fill matrix D on host with zeros

  hytlass::HostTensor<int, LayoutOutput> tensor_indices({options.index_size, 1}); // <- Create gather indices with dimensions val_len x 1

  // <- Fill tensor_k_indices on host with unique random integers
  std::vector<int> to_fill(problem_size.k());           // vector with ints.
  std::iota(std::begin(to_fill), std::end(to_fill), 0); // Fill with 0, 1, ..., problem_size.k() - 1
  {
    // std::random_shuffle was deprecated in C++14 and removed in C++17
    std::random_device make_seed;
    std::mt19937 source_of_randomness(make_seed());
    std::shuffle(to_fill.begin(), to_fill.end(), source_of_randomness);
  }
  memcpy(tensor_indices.host_data(), to_fill.data(), options.index_size * sizeof(int));

  // Copy data from host to GPU
  tensor_a.sync_device();
  tensor_b.sync_device();
  tensor_indices.sync_device();
  tensor_c.sync_device();
  tensor_d.sync_device();

  // Initialize alpha/beta for dot product computation
  ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
  ElementComputeEpilogue beta = ElementComputeEpilogue(1);

  // Split K dimension into 1 partitions
  int split_k_slices = 1;

  // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
  // instantiated HYTLASS kernel
  typename Gemm::Arguments arguments{
      hytlass::gemm::GemmUniversalMode::kGemm,
      problem_size_real,            // <- problem size of matrix multiplication
      split_k_slices,               // <- k-dimension split factor
      {alpha, beta},                // <- alpha, beta
      tensor_a.device_data(),       // <- reference to matrix A on device
      tensor_b.device_data(),       // <- reference to matrix B on device
      tensor_c.device_data(),       // <- reference to matrix C on device
      tensor_d.device_data(),       // <- reference to matrix D on device
      tensor_a.layout().capacity(hytlass::make_Coord(problem_size.m(), options.index_size)),
      tensor_b.layout().capacity(hytlass::make_Coord(options.index_size, problem_size.n())),
      tensor_c.layout().capacity(problem_size.mn()),
      tensor_d.layout().capacity(problem_size.mn()),
      tensor_a.layout().stride(),
      tensor_b.layout().stride(),
      tensor_c.layout().stride(),
      tensor_d.layout().stride(),
      tensor_indices.device_data(), // <- pointer to index vector to gather A on device
      tensor_indices.device_data(), // <- pointer to index vector to gather B on device
      nullptr};                     // <- pointer to index vector to scatter D on device

  // Using the arguments, query for extra workspace required for matrix multiplication computation
  size_t workspace_size = Gemm::get_workspace_size(arguments);

  // Allocate workspace memory
  hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  // Instantiate HYTLASS kernel depending on templates
  Gemm gemm_op;

  // Check the problem size is supported or not
  hytlass::Status status = gemm_op.can_implement(arguments);
  HYTLASS_CHECK(status);

  // Initialize HYTLASS kernel with arguments and workspace pointer
  status = gemm_op.initialize(arguments, workspace.get());
  HYTLASS_CHECK(status);

  // CPU reference calculation
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d_ref(problem_size.mn());
  hytlass::reference::host::TensorFill(tensor_d_ref.host_view()); // <- Fill matrix D on host with zeros

  status = gemm_op();
  (void)hipDeviceSynchronize();
  HYTLASS_CHECK(status);

  if (options.reference_check) {
    for (int i = 0; i < problem_size.m(); ++i) {
      for (int j = 0; j < problem_size.n(); ++j) {
        for (int k = 0; k < options.index_size; ++k) {
          int _k = tensor_indices.at({k, 0});
          tensor_d_ref.at({i, j}) += (alpha * tensor_a.at({i, _k}) * tensor_b.at({_k, j}));
        }
        tensor_d_ref.at({i, j}) += (beta * tensor_c.at({i, j}));
      }
    }

    // Copy output data from HYTLASS and reference kernel to host for comparison
    tensor_d.sync_host();

    bool passed = hytlass::reference::host::TensorEquals(tensor_d.host_view(), tensor_d_ref.host_view());

    if (!passed) {
      std::cout << "Failed!\n";

      std::stringstream fname;
      fname << "error_gather_k_GEMM_fusion.txt";
      std::cerr << "Dumping results in " << fname.str() << "\n";

      std::ofstream file(fname.str());

      file << "A =\n" << tensor_a.host_view()
           << "\nB =\n" << tensor_b.host_view()
           << "\nindices =\n" << tensor_indices.host_view()
           << "\nC =\n" << tensor_c.host_view()
           << "\n\nReference =\n" << tensor_d_ref.host_view()
           << "\nComputed =\n" << tensor_d.host_view();

      return -1;
    } else {
      std::cout << "Passed!\n";
    }
  }

  // Result structure
  Result result;

  //
  // Construct events
  //
  hipEvent_t events[2];
  for (auto &event : events) {
    result.error = hipEventCreate(&event);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventCreate() failed: " << hipGetErrorString(result.error) << std::endl;
      return -1;
    }
  }

  // warm up
  for (int iter = 0; iter < 10; ++iter) {
    status = gemm_op();
    HYTLASS_CHECK(status);
  }

  // Record an event at the start of a series of GEMMs
  result.error = hipEventRecord(events[0]);
  if (result.error != hipSuccess) {
    std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
    return -1;
  }

  //
  // Run profiling loop
  //
  for (int iter = 0; iter < options.iterations; ++iter) {
    // Launch initialized HYTLASS kernel
    status = gemm_op();
    HYTLASS_CHECK(status);
  }

  //
  // Stop profiling loop
  //

  // Record an event when the GEMMs are complete
  result.error = hipEventRecord(events[1]);
  if (result.error != hipSuccess) {
    std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
    return -1;
  }

  // Wait for work on the device to complete.
  result.error = hipEventSynchronize(events[1]);
  if (result.error != hipSuccess) {
    std::cerr << "hipEventSynchronize() failed: " << hipGetErrorString(result.error) << std::endl;
    return -1;
  }

  // Measure elapsed runtime
  float runtime_ms = 0;
  result.error = hipEventElapsedTime(&runtime_ms, events[0], events[1]);
  if (result.error != hipSuccess) {
    std::cerr << "hipEventElapsed() failed: " << hipGetErrorString(result.error) << std::endl;
    return -1;
  }

  // Compute average runtime and GFLOPs.
  result.runtime_ms = double(runtime_ms) / double(options.iterations);
  result.gflops = options.gflops(result.runtime_ms / 1000.0);

  // Cleanup
  for (auto event : events) {
    (void)hipEventDestroy(event);
  }

  std::cout << "Runtime: " << result.runtime_ms << " ms\n";
  std::cout << "GFLOPs: " << result.gflops << "\n";

  return 0;
}

int main(int argc, const char **argv) {

  Options options;
  options.parse(argc, argv);

  if (options.help) {
    options.print_usage(std::cout) << "\n";
    return 0;
  }

  if (!options.valid()) {
    std::cerr << "Invalid problem." << "\n";
    return -1;
  }

  return run(options);
}
examples/14_gather_scatter_fusion/gather_scatter_fusion.cu
0 → 100644
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
// This example fuses gather before GEMM and scatter after GEMM into the same
// GEMM kernel. The gather and scatter operations are controlled by an index
// vector that selects rows or columns from the A, B, C or D matrices.
//
// Suppose all matrices are column major. The pseudo code of the fused kernel
// in this example is essentially
//
// for (int i = 0; i < problem_size.m(); ++i) {
//   for (int j = 0; j < options.index_size; ++j) {
//     int b_c_d_col = tensor_indices.at({j, 0});
//
//     for (int k = 0; k < problem_size.k(); ++k) {
//       tensor_d_ref.at({i, b_c_d_col}) +=
//           alpha * tensor_a.at({i, k}) * tensor_b.at({k, b_c_d_col});
//     }
//   }
// }
//
// Note that the index vector contains unique random integers with maximum value N - 1.
//
// The gather/scatter operation works best when we can still keep the biggest
// alignment. For example, when the matrix is row major, we select rows. When
// the matrix is column major, we select columns.
//
// Not all combinations of gather and scatter are legal. For example, if A is
// row major and C/D is column major, we cannot gather A and scatter C/D at the
// same time.
//
// Also, for the sake of performance, we do not check that the index values are
// legal or that the index array pointer is valid.
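// In this variant only B is gathered and D is scattered: A is read densely (its
// gather pointer is nullptr in the kernel arguments below), while the same
// N-dimension index vector is used both to gather columns of B and to scatter
// the corresponding columns of D, so only the indexed columns of the output
// (and of the beta-scaled C read) are touched.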
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include <hip/hip_runtime.h>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <random>
#include <numeric>
#include "hytlass/hytlass.h"
#include "hytlass/gemm/device/gemm_universal.h"
#include "hytlass/epilogue/thread/linear_combination.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/tensor_view_io.h"
#include "helper.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Result structure
struct Result {
  double runtime_ms;
  double gflops;
  hytlass::Status status;
  hipError_t error;
  bool passed;

  //
  // Methods
  //
  Result(
      double runtime_ms = 0,
      double gflops = 0,
      hytlass::Status status = hytlass::Status::kSuccess,
      hipError_t error = hipSuccess)
      : runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) {}
};

/////////////////////////////////////////////////////////////////////////////////////////////////

// Command line options parsing
struct Options {

  bool help;
  hytlass::gemm::GemmCoord problem_size;
  int index_size;
  bool reference_check;
  int iterations;

  Options()
      : help(false),
        problem_size({248, 1024, 1024}),
        index_size(240),
        reference_check(true),
        iterations(50) {}

  bool valid() { return true; }

  // Parses the command line
  void parse(int argc, char const **args) {
    hytlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
    }

    cmd.get_cmd_line_argument("m", problem_size.m());
    cmd.get_cmd_line_argument("n", problem_size.n());
    cmd.get_cmd_line_argument("k", problem_size.k());
    cmd.get_cmd_line_argument("index_size", index_size);
    cmd.get_cmd_line_argument("iterations", iterations);
  }

  /// Prints the usage statement.
  std::ostream &print_usage(std::ostream &out) const {

    out << "14_gather_scatter_fusion example\n\n"
        << "  This example uses the HYTLASS Library to fuse gather/scatter into GEMM\n\n"
        << "Options:\n\n"
        << "  --help                      If specified, displays this usage statement.\n\n"
        << "  --m=<int>                   GEMM M dimension\n"
        << "  --n=<int>                   GEMM N dimension\n"
        << "  --k=<int>                   GEMM K dimension\n"
        << "  --index_size=<int>          size of N dimension index\n"
        << "  --iterations=<int>          Number of profiling iterations to perform.\n\n";

    out << "\n\nExamples:\n\n"
        << "$ ./examples/14_gather_scatter_fusion/gather_scatter_fusion --m=1024 --n=512 --k=1024 \\\n"
        << "     --index_size=128\n\n";

    return out;
  }

  /// Compute performance in GFLOP/s
  double gflops(double runtime_s) const {

    // Number of real-valued multiply-adds
    int64_t fmas = problem_size.m() * int64_t(index_size) * problem_size.k();

    // Two flops per multiply-add
    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using ElementAccumulator = float;                  // <- data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations
using ElementInputA = hytlass::half_t;             // <- data type of elements in input matrix A
using ElementInputB = hytlass::half_t;             // <- data type of elements in input matrix B
using ElementOutput = float;                       // <- data type of elements in output matrix D

// The code section below describes matrix layout of input and output matrices.
// Column Major for Matrix A, B and C.
//
using LayoutInputA = hytlass::layout::ColumnMajor;
using LayoutInputB = hytlass::layout::ColumnMajor;
using LayoutOutput = hytlass::layout::ColumnMajor;

// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = hytlass::arch::OpClassTensorOp;

// This code section describes GFX architecture number
using SmArch = hytlass::arch::Gfx928;

// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock = hytlass::gemm::GemmShape<128, 256, 32>;

// This code section describes tile size a warp will compute
using ShapeMMAWarp = hytlass::gemm::GemmShape<64, 128, 32>;

// This code section describes the size of MMA op
using ShapeMMAOp = hytlass::gemm::GemmShape<16, 16, 16>;

// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = hytlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;

// Define the epilogue operation as LinearCombination. This is approximately equal to
//
//    d_ij = alpha * sum_k(a_ik * b_kj) + c_ij
//
using EpilogueOp = hytlass::epilogue::thread::LinearCombination<
    ElementOutput,                                    // <- data type of output matrix
    128 / hytlass::sizeof_bits<ElementOutput>::value, // <- this is the number of elements per
                                                      //    vectorized memory access. For half
                                                      //    precision, it's 8 elements. This becomes
                                                      //    the vector width of math instructions in
                                                      //    epilogue too
    ElementAccumulator,                               // <- data type of accumulator
    ElementComputeEpilogue>;                          // <- data type for alpha in linear combination function

// Number of pipelines you want to use
constexpr int NumStages = 1;

using Gemm = hytlass::gemm::device::GemmUniversal<
    ElementInputA, LayoutInputA,
    ElementInputB, LayoutInputB,
    ElementOutput, LayoutOutput,
    ElementAccumulator,
    MMAOp,
    SmArch,
    ShapeMMAThreadBlock,
    ShapeMMAWarp,
    ShapeMMAOp,
    EpilogueOp,
    SwizzleThreadBlock,
    NumStages,
    8, /*alignmentA*/
    8, /*alignmentB*/
    hytlass::arch::OpMultiplyAdd,
    hytlass::ComplexTransform::kNone,
    hytlass::ComplexTransform::kNone,
    false, /*GatherA*/
    true,  /*GatherB*/
    true,  /*ScatterD*/
    hytlass::layout::NoPermute,
    hytlass::layout::NoPermute,
    hytlass::layout::NoPermute,
    true>;
int run(Options &options) {

  // ================================================================================
  // Initialization setup

  // Create a tuple of problem size for matrix multiplication
  hytlass::gemm::GemmCoord problem_size = options.problem_size;

  // Create a tuple of problem size for matrix multiplication
  hytlass::gemm::GemmCoord problem_size_real(problem_size.m(), options.index_size, problem_size.k());

  // Initialize tensors using HYTLASS helper functions
  hytlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(problem_size.mk()); // <- Create matrix A with dimensions M x K
  hytlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(problem_size.kn()); // <- Create matrix B with dimensions K x N
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(problem_size.mn()); // <- Create matrix C with dimensions M x N
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d_scattered(problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
                                                                                          //    HYTLASS kernel

  // Fill input and output matrices on host using HYTLASS helper functions
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_a.host_view(), 1, ElementInputA(3), ElementInputA(-3), 0); // <- Fill matrix A on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_b.host_view(), 1, ElementInputA(3), ElementInputA(-3), 0); // <- Fill matrix B on host with uniform-distribution random data
  hytlass::reference::host::TensorFillRandomUniform(
      tensor_c.host_view(), 1, ElementOutput(7), ElementOutput(-8), 0); // <- Fill matrix C on host with uniform-distribution random data
  hytlass::reference::host::TensorFill(tensor_d_scattered.host_view()); // <- fill matrix D on host with zeros

  hytlass::HostTensor<int, LayoutOutput> tensor_indices({options.index_size, 1}); // <- Create scatter indices with dimensions val_len x 1

  // <- Fill tensor_b_indices on host with unique random integers
  std::vector<int> to_fill(problem_size.n());           // vector with ints.
  std::iota(std::begin(to_fill), std::end(to_fill), 0); // Fill with 0, 1, ...., problem_size.n()
  {
    // std::random_shuffle was deprecated in C++14 and removed in C++17
    std::random_device make_seed;
    std::mt19937 source_of_randomness(make_seed());
    std::shuffle(to_fill.begin(), to_fill.end(), source_of_randomness);
  }
  memcpy(tensor_indices.host_data(), to_fill.data(), options.index_size * sizeof(int));

  // Copy data from host to GPU
  tensor_a.sync_device();
  tensor_b.sync_device();
  tensor_indices.sync_device();
  tensor_c.sync_device();
  tensor_d_scattered.sync_device();

  // Initialize alpha/beta for dot product computation
  ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
  ElementComputeEpilogue beta = ElementComputeEpilogue(1);

  // Split K dimension into 1 partitions
  int split_k_slices = 1;

  // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
  // instantiated HYTLASS kernel
  typename Gemm::Arguments arguments{
      hytlass::gemm::GemmUniversalMode::kGemm,
      problem_size_real,                  // <- problem size of matrix multiplication
      split_k_slices,                     // <- k-dimension split factor
      {alpha, beta},                      // <- alpha, beta
      tensor_a.device_data(),             // <- reference to matrix A on device
      tensor_b.device_data(),             // <- reference to matrix B on device
      tensor_c.device_data(),             // <- reference to matrix C on device
      tensor_d_scattered.device_data(),   // <- reference to matrix D on device
      tensor_a.layout().capacity(problem_size.mk()),
      tensor_b.layout().capacity(hytlass::make_Coord(options.index_size, problem_size.k())),
      tensor_c.layout().capacity(problem_size.mn()),
      tensor_d_scattered.layout().capacity(problem_size.mn()),
      tensor_a.layout().stride(),
      tensor_b.layout().stride(),
      tensor_c.layout().stride(),
      tensor_d_scattered.layout().stride(),
      nullptr,                            // <- pointer to index vector to gather A on device
      tensor_indices.device_data(),       // <- pointer to index vector to gather B on device
      tensor_indices.device_data()};      // <- pointer to index vector to scatter D on device

  // Using the arguments, query for extra workspace required for matrix multiplication computation
  size_t workspace_size = Gemm::get_workspace_size(arguments);

  // Allocate workspace memory
  hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  // Instantiate HYTLASS kernel depending on templates
  Gemm gemm_op;

  // Check the problem size is supported or not
  hytlass::Status status = gemm_op.can_implement(arguments);
  HYTLASS_CHECK(status);

  // Initialize HYTLASS kernel with arguments and workspace pointer
  status = gemm_op.initialize(arguments, workspace.get());
  HYTLASS_CHECK(status);

  // CPU reference calculation
  hytlass::HostTensor<ElementOutput, LayoutOutput> tensor_d_ref(problem_size.mn());
  hytlass::reference::host::TensorFill(tensor_d_ref.host_view()); // <- Fill matrix D on host with zeros

  status = gemm_op();
  (void)hipDeviceSynchronize();
  HYTLASS_CHECK(status);

  if (options.reference_check) {
    for (int i = 0; i < problem_size.m(); ++i) {
      for (int j = 0; j < options.index_size; ++j) {
        int b_c_d_col = tensor_indices.at({j, 0});

        for (int k = 0; k < problem_size.k(); ++k) {
          tensor_d_ref.at({i, b_c_d_col}) +=
              alpha * tensor_a.at({i, k}) * tensor_b.at({k, b_c_d_col});
        }
        tensor_d_ref.at({i, b_c_d_col}) += (beta * tensor_c.at({i, b_c_d_col}));
      }
    }

    // Copy output data from HYTLASS and reference kernel to host for comparison
    tensor_d_scattered.sync_host();

    bool passed = hytlass::reference::host::TensorEquals(tensor_d_scattered.host_view(), tensor_d_ref.host_view());

    if (!passed) {
      std::cout << "Failed!\n";

      std::stringstream fname;
      fname << "error_gather_GEMM_scatter_fusion.txt";
      std::cerr << "Dumping results in " << fname.str() << "\n";

      std::ofstream file(fname.str());

      file << "A =\n" << tensor_a.host_view()
           << "\nB =\n" << tensor_b.host_view()
           << "\nindices =\n" << tensor_indices.host_view()
           << "\nC =\n" << tensor_c.host_view()
           << "\n\nReference =\n" << tensor_d_ref.host_view()
           << "\nComputed =\n" << tensor_d_scattered.host_view();

      return -1;
    } else {
      std::cout << "Passed!\n";
    }
  }

  // Result structure
  Result result;

  //
  // Construct events
  //
  hipEvent_t events[2];
  for (auto &event : events) {
    result.error = hipEventCreate(&event);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventCreate() failed: " << hipGetErrorString(result.error) << std::endl;
      return -1;
    }
  }

  // warm up
  for (int iter = 0; iter < 10; ++iter) {
    status = gemm_op();
    HYTLASS_CHECK(status);
  }

  // Record an event at the start of a series of GEMMs
  result.error = hipEventRecord(events[0]);
  if (result.error != hipSuccess) {
    std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
    return -1;
  }

  //
  // Run profiling loop
  //
  for (int iter = 0; iter < options.iterations; ++iter) {
    // Launch initialized HYTLASS kernel
    status = gemm_op();
    HYTLASS_CHECK(status);
  }

  //
  // Stop profiling loop
  //

  // Record an event when the GEMMs are complete
  result.error = hipEventRecord(events[1]);
  if (result.error != hipSuccess) {
    std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
    return -1;
  }

  // Wait for work on the device to complete.
  result.error = hipEventSynchronize(events[1]);
  if (result.error != hipSuccess) {
    std::cerr << "hipEventSynchronize() failed: " << hipGetErrorString(result.error) << std::endl;
    return -1;
  }

  // Measure elapsed runtime
  float runtime_ms = 0;
  result.error = hipEventElapsedTime(&runtime_ms, events[0], events[1]);
  if (result.error != hipSuccess) {
    std::cerr << "hipEventElapsed() failed: " << hipGetErrorString(result.error) << std::endl;
    return -1;
  }

  // Compute average runtime and GFLOPs.
  result.runtime_ms = double(runtime_ms) / double(options.iterations);
  result.gflops = options.gflops(result.runtime_ms / 1000.0);

  // Cleanup
  for (auto event : events) {
    (void)hipEventDestroy(event);
  }

  std::cout << "Runtime: " << result.runtime_ms << " ms\n";
  std::cout << "GFLOPs: " << result.gflops << "\n";

  return 0;
}

int main(int argc, const char **argv) {

  bool notSupported = false;

  Options options;
  options.parse(argc, argv);

  if (options.help) {
    options.print_usage(std::cout) << "\n";
    return 0;
  }

  if (!options.valid()) {
    std::cerr << "Invalid problem." << "\n";
    return -1;
  }

  return run(options);
}
examples/15_hute_group_gemm/CMakeLists.txt
0 → 100644
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
  hute_gfx928_group_gemm
  hute_gfx928_group_gemm.cu
)
\ No newline at end of file
examples/15_hute_group_gemm/hute_gfx928_group_gemm.cu
0 → 100644
/***************************************************************************************************
* Copyright (c) 2023 - 2023 HYGON CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
    \brief Grouped GEMM example using HYTLASS 3 APIs for DCU architecture.

    For this example all scheduling work is performed on the device.

    To run this example:

      $ ./hute_gfx928_group_gemm --m=2048 --n=2048 --k=2048 --groups=10

    The above command sizes all 10 groups at the given m, n, k values.
    Skipping any of the problem dimensions randomizes it across the different groups.
    The same applies to alpha and beta values, which are randomized across the different groups.

    To run this example for a set of problems using the benchmark option:

      $ ./hute_gfx928_group_gemm --benchmark=./test_benchmark.txt

    Where test_benchmark.txt may look like this:

      0 256x512x128
      1 256x512x512
      2 512x256x128
      3 256x256x128
      4 256x512x1024
      5 1024x512x128

    and so on.
*/
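// Note: when a benchmark file is used, benchmark_problems() below rounds each
// parsed extent up to the element alignment (128 bits / sizeof_bits<ElementA>,
// i.e. 8 half_t elements), so the dimensions actually run may be slightly
// larger than those listed in the file.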
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <float.h>
#include "hytlass/hytlass.h"
#include "hute/tensor.hpp"
#include "hytlass/tensor_ref.h"
#include "hytlass/epilogue/collective/default_epilogue.hpp"
#include "hytlass/epilogue/thread/linear_combination.h"
#include "hytlass/gemm/dispatch_policy.hpp"
#include "hytlass/gemm/group_array_problem_shape.hpp"
#include "hytlass/gemm/collective/collective_builder.hpp"
#include "hytlass/epilogue/collective/collective_builder.hpp"
#include "hytlass/gemm/device/gemm_universal_adapter.h"
#include "hytlass/gemm/kernel/gemm_universal.hpp"
#include "hytlass/util/command_line.h"
#include "hytlass/util/distribution.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/packed_stride.hpp"
#include "hytlass/util/tensor_view_io.h"
#include "hytlass/util/reference/device/gemm.h"
#include "hytlass/util/reference/device/tensor_compare.h"
#include "hytlass/util/reference/device/tensor_fill.h"
#include "helper.h"
using namespace hute;

using ProblemShape = hytlass::gemm::GroupProblemShape<Shape<int, int, int>>; // <M,N,K> per group

using ElementA = hytlass::half_t; // Element type for A matrix operand
using ElementB = hytlass::half_t; // Element type for B matrix operand
using ElementC = hytlass::half_t; // Element type for C and D matrix operands

/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////

// A matrix configuration
using LayoutA = hytlass::layout::ColumnMajor; // Layout type for A matrix operand
constexpr int AlignmentA = 128 / hytlass::sizeof_bits<ElementA>::value; // Alignment of A matrix in units of elements (up to 16 bytes)

// B matrix configuration
using LayoutB = hytlass::layout::RowMajor; // Layout type for B matrix operand
constexpr int AlignmentB = 128 / hytlass::sizeof_bits<ElementB>::value; // Alignment of B matrix in units of elements (up to 16 bytes)

// C/D matrix configuration
using LayoutC = hytlass::layout::ColumnMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC = 128 / hytlass::sizeof_bits<ElementC>::value; // Alignment of C matrix in units of elements (up to 16 bytes)

// Core kernel configurations
using ElementAccumulator = float;                      // Element type for internal accumulation
using ArchTag = hytlass::arch::Gfx928;                 // Tag indicating the minimum Gfx that supports the intended feature
using OperatorClass = hytlass::arch::OpClassTensorOp;  // Operator class tag
using TileShape = Shape<_128, _128, _32>;              // Threadblock-level tile size
using ClusterShape = Shape<_1, _1, _1>;
using WarpShape_MNK = Shape<_2, _2, _1>;
using InstructionShape_MNK = Shape<_32, _32, _16>;
using StageCountType = hytlass::gemm::collective::StageCount<2>;
using KernelSchedule = hytlass::gemm::KernelPtrArraySpecialized;           // Kernel to launch
using EpilogueSchedule = hytlass::epilogue::PtrArrayNoSmemWarpSpecialized; // Epilogue to launch

using CollectiveEpilogue = typename hytlass::epilogue::collective::CollectiveBuilder<
    hytlass::arch::Gfx928, hytlass::arch::OpClassTensorOp,
    TileShape, ClusterShape,
    hytlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementAccumulator,
    ElementC, LayoutC *, AlignmentC,
    ElementC, LayoutC *, AlignmentC,
    EpilogueSchedule>::CollectiveOp;

using CollectiveMainloop = typename hytlass::gemm::collective::CollectiveBuilder<
    ArchTag, OperatorClass,
    ElementA, LayoutA *, AlignmentA,
    ElementB, LayoutB *, AlignmentB,
    ElementAccumulator,
    TileShape, WarpShape_MNK, InstructionShape_MNK, ClusterShape,
    StageCountType,
    KernelSchedule>::CollectiveOp;

using GemmKernel = hytlass::gemm::kernel::GemmUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue>;

using Gemm = hytlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

// Reference device GEMM implementation type
using DeviceGemmReference = hytlass::reference::device::Gemm<
    ElementA, LayoutA,
    ElementB, LayoutB,
    ElementC, LayoutC,
    ElementAccumulator, ElementAccumulator>;

using StrideA = typename Gemm::GemmKernel::UnderlyingStrideA;
using StrideB = typename Gemm::GemmKernel::UnderlyingStrideB;
using StrideC = typename Gemm::GemmKernel::UnderlyingStrideC;
using StrideD = typename Gemm::GemmKernel::UnderlyingStrideD;

// Host-side allocations
std::vector<int64_t> offset_A;
std::vector<int64_t> offset_B;
std::vector<int64_t> offset_C;
std::vector<int64_t> offset_D;

std::vector<StrideA> stride_A_host;
std::vector<StrideB> stride_B_host;
std::vector<StrideC> stride_C_host;
std::vector<StrideD> stride_D_host;

std::vector<ElementAccumulator> alpha_host;
std::vector<ElementAccumulator> beta_host;

// Device-side allocations
hytlass::DeviceAllocation<typename ProblemShape::UnderlyingProblemShape> problem_sizes;

hytlass::DeviceAllocation<typename Gemm::ElementA> block_A;
hytlass::DeviceAllocation<typename Gemm::ElementB> block_B;
hytlass::DeviceAllocation<typename Gemm::ElementC> block_C;
hytlass::DeviceAllocation<typename Gemm::EpilogueOutputOp::ElementOutput> block_D;
hytlass::DeviceAllocation<typename Gemm::EpilogueOutputOp::ElementOutput> block_ref_D;

hytlass::DeviceAllocation<const typename Gemm::ElementA *> ptr_A;
hytlass::DeviceAllocation<const typename Gemm::ElementB *> ptr_B;
hytlass::DeviceAllocation<const typename Gemm::ElementC *> ptr_C;
hytlass::DeviceAllocation<typename Gemm::EpilogueOutputOp::ElementOutput *> ptr_D;
hytlass::DeviceAllocation<typename Gemm::EpilogueOutputOp::ElementOutput *> ptr_ref_D;

hytlass::DeviceAllocation<StrideA> stride_A;
hytlass::DeviceAllocation<StrideB> stride_B;
hytlass::DeviceAllocation<StrideC> stride_C;
hytlass::DeviceAllocation<StrideD> stride_D;

// Note, this is an array of pointers to alpha and beta scaling values per group
hytlass::DeviceAllocation<ElementAccumulator *> alpha_device;
hytlass::DeviceAllocation<ElementAccumulator *> beta_device;
hytlass::DeviceAllocation<ElementAccumulator> block_alpha;
hytlass::DeviceAllocation<ElementAccumulator> block_beta;
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Testbed utility types
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {

  bool help = false;

  float alpha = FLT_MAX;
  float beta = FLT_MAX;
  int iterations = 10;
  int m = 1024, n = 2048, k = 512, groups = 10;
  std::string benchmark_path;
  std::vector<typename ProblemShape::UnderlyingProblemShape> problem_sizes_host;
  int const tma_alignment_bits = 128;
  int const alignment = tma_alignment_bits / hytlass::sizeof_bits<ElementA>::value;

  // Parses the command line
  void parse(int argc, char const **args) {
    hytlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
      return;
    }

    cmd.get_cmd_line_argument("m", m);
    cmd.get_cmd_line_argument("n", n);
    cmd.get_cmd_line_argument("k", k);
    cmd.get_cmd_line_argument("groups", groups);
    cmd.get_cmd_line_argument("alpha", alpha, FLT_MAX);
    cmd.get_cmd_line_argument("beta", beta, FLT_MAX);
    cmd.get_cmd_line_argument("iterations", iterations);
    cmd.get_cmd_line_argument("benchmark", benchmark_path);

    // Decide how to initialize the problems
    if (!benchmark_path.empty()) {
      if (!benchmark_problems()) {
        problem_sizes_host.clear();
        return;
      }
    }
    else {
      randomize_problems(cmd);
    }
  }

  void randomize_problems(hytlass::CommandLine &cmd) {
    int cmd_line_m = -1, cmd_line_n = -1, cmd_line_k = -1;
    cmd.get_cmd_line_argument("m", cmd_line_m);
    cmd.get_cmd_line_argument("n", cmd_line_n);
    cmd.get_cmd_line_argument("k", cmd_line_k);

    problem_sizes_host.reserve(groups);

    for (int i = groups; i > 0; i--) {
      int m = cmd_line_m;
      int n = cmd_line_n;
      int k = cmd_line_k;
      if (m < 1) {
        m = ((rand() % 512) + 1);
      }
      if (n < 1) {
        n = ((rand() % 512) + 1);
      }
      if (k < 1) {
        k = alignment * ((rand() % 64) + 1);
      }
      problem_sizes_host.push_back({m, n, k});
    }
  }

  /// Load a benchmark
  bool benchmark_problems() {
    std::ifstream file(benchmark_path);
    if (!file.good()) {
      return false;
    }

    while (file.good()) {

      int idx = -1;
      std::string extent_str;

      file >> idx >> extent_str;

      if (idx < 0 || extent_str.empty()) {
        break;
      }

      hytlass::gemm::GemmCoord extent;
      std::vector<std::string> tokens;

      hytlass::CommandLine::tokenize(tokens, extent_str, 'x');

      for (int i = 0; i < int(tokens.size()); ++i) {
        int x = std::atoi(tokens.at(i).c_str());

        // round up
        if (x % alignment) {
          x += (alignment - (x % alignment));
        }

        extent.at(i) = x;
      }

      if (extent.product()) {
        problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()});
      }
    }
    groups = static_cast<int>(problem_sizes_host.size());

    return true;
  }

  /// Prints the usage statement.
  std::ostream &print_usage(std::ostream &out) const {

    out << "hute_group_gemm\n\n"
        << "Options:\n\n"
        << "  --help                      If specified, displays this usage statement\n\n"
        << "  --m=<int>                   Sets the M extent of the GEMM for all groups\n"
        << "  --n=<int>                   Sets the N extent of the GEMM for all groups\n"
        << "  --k=<int>                   Sets the K extent of the GEMM for all groups\n"
        << "  --groups=<int>              Sets the number of individual GEMM problems for Grouped GEMM\n"
        << "  --alpha=<f32>               Epilogue scalar alpha\n"
        << "  --beta=<f32>                Epilogue scalar beta\n\n"
        << "  --iterations=<int>          Number of profiling iterations to perform\n\n"
        << "  --benchmark=<str>           Executes a benchmark problem size.\n";

    out << "\n\nExamples:\n\n"
        << "$ " << "hute_gfx928_group_gemm" << " --m=1024 --n=512 --k=1024 --groups=10 --alpha=2 --beta=0.707 \n\n";

    return out;
  }

  /// Compute performance in GFLOP/s
  double gflops(double runtime_s, std::vector<typename ProblemShape::UnderlyingProblemShape> problem_sizes_host) const {
    // Number of real-valued multiply-adds
    uint64_t fmas = uint64_t();

    for (auto const &problem : problem_sizes_host) {
      fmas += static_cast<uint64_t>(get<0>(problem)) *
              static_cast<uint64_t>(get<1>(problem)) *
              static_cast<uint64_t>(get<2>(problem));
    }
    // Two flops per multiply-add
    uint64_t flop = uint64_t(2) * uint64_t(fmas);
    double gflop = double(flop) / double(1.0e9);
    return gflop / runtime_s;
  }
};

/// Result structure
struct Result {
  double avg_runtime_ms = 0.0;
  double gflops = 0.0;
  hytlass::Status status = hytlass::Status::kSuccess;
  hipError_t error = hipSuccess;
  bool passed = false;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM setup and evaluation
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper to initialize a block of device data
template <class Element>
bool initialize_block(
    hytlass::DeviceAllocation<Element> &block,
    uint64_t seed = 2023) {

  Element scope_max, scope_min;
  int bits_input = hytlass::sizeof_bits<Element>::value;

  if (bits_input == 1) {
    scope_max = static_cast<Element>(1);
    scope_min = static_cast<Element>(1);
  }
  else if (bits_input <= 8) {
    scope_max = static_cast<Element>(1);
    scope_min = static_cast<Element>(1);
  }
  else {
    scope_max = static_cast<Element>(4);
    scope_min = static_cast<Element>(2);
  }

  hytlass::reference::device::BlockFillRandomUniform(
      block.get(), block.size(), seed, scope_max, scope_min, 0);

  return true;
}

/// Allocates device-side data
void allocate(const Options &options) {
  int64_t total_elements_A = 0;
  int64_t total_elements_B = 0;
  int64_t total_elements_C = 0;
  int64_t total_elements_D = 0;

  for (int32_t i = 0; i < options.groups; ++i) {

    auto problem = options.problem_sizes_host.at(i);
    auto M = get<0>(problem);
    auto N = get<1>(problem);
    auto K = get<2>(problem);

    offset_A.push_back(total_elements_A);
    offset_B.push_back(total_elements_B);
    offset_C.push_back(total_elements_C);
    offset_D.push_back(total_elements_D);

    int64_t elements_A = M * K;
    int64_t elements_B = K * N;
    int64_t elements_C = M * N;
    int64_t elements_D = M * N;

    total_elements_A += elements_A;
    total_elements_B += elements_B;
    total_elements_C += elements_C;
    total_elements_D += elements_D;

    stride_A_host.push_back(hytlass::make_hute_packed_stride(StrideA{}, hute::make_shape(M, K, Int<1>{})));
    stride_B_host.push_back(hytlass::make_hute_packed_stride(StrideB{}, hute::make_shape(N, K, Int<1>{})));
    stride_C_host.push_back(hytlass::make_hute_packed_stride(StrideC{}, hute::make_shape(M, N, Int<1>{})));
    stride_D_host.push_back(hytlass::make_hute_packed_stride(StrideD{}, hute::make_shape(M, N, Int<1>{})));
  }

  block_A.reset(total_elements_A);
  block_B.reset(total_elements_B);
  block_C.reset(total_elements_C);
  block_D.reset(total_elements_D);
  block_ref_D.reset(total_elements_D);
  block_alpha.reset(options.groups);
  block_beta.reset(options.groups);
}

/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(const Options &options) {

  uint64_t seed = 2020;

  problem_sizes.reset(options.groups);
  problem_sizes.copy_from_host(options.problem_sizes_host.data());

  //
  // Assign pointers
  //

  std::vector<ElementA *> ptr_A_host(options.groups);
  std::vector<ElementB *> ptr_B_host(options.groups);
  std::vector<ElementC *> ptr_C_host(options.groups);
  std::vector<ElementC *> ptr_D_host(options.groups);
  std::vector<ElementAccumulator *> ptr_alpha_host(options.groups);
  std::vector<ElementAccumulator *> ptr_beta_host(options.groups);

  for (int32_t i = 0; i < options.groups; ++i) {
    ptr_A_host.at(i) = block_A.get() + offset_A.at(i);
    ptr_B_host.at(i) = block_B.get() + offset_B.at(i);
    ptr_C_host.at(i) = block_C.get() + offset_C.at(i);
    ptr_D_host.at(i) = block_D.get() + offset_D.at(i);
    alpha_host.push_back((options.alpha == FLT_MAX) ? static_cast<ElementAccumulator>((rand() % 5) + 1) : options.alpha);
    beta_host.push_back((options.beta == FLT_MAX) ? static_cast<ElementAccumulator>(rand() % 5) : options.beta);
    ptr_alpha_host.at(i) = block_alpha.get() + i;
    ptr_beta_host.at(i) = block_beta.get() + i;
  }

  ptr_A.reset(options.groups);
  ptr_A.copy_from_host(ptr_A_host.data());

  ptr_B.reset(options.groups);
  ptr_B.copy_from_host(ptr_B_host.data());

  ptr_C.reset(options.groups);
  ptr_C.copy_from_host(ptr_C_host.data());

  ptr_D.reset(options.groups);
  ptr_D.copy_from_host(ptr_D_host.data());

  stride_A.reset(options.groups);
  stride_A.copy_from_host(stride_A_host.data());

  stride_B.reset(options.groups);
  stride_B.copy_from_host(stride_B_host.data());

  stride_C.reset(options.groups);
  stride_C.copy_from_host(stride_C_host.data());

  stride_D.reset(options.groups);
  stride_D.copy_from_host(stride_D_host.data());

  alpha_device.reset(options.groups);
  alpha_device.copy_from_host(ptr_alpha_host.data());
  beta_device.reset(options.groups);
  beta_device.copy_from_host(ptr_beta_host.data());

  initialize_block(block_A, seed + 2023);
  initialize_block(block_B, seed + 2022);
  initialize_block(block_C, seed + 2021);

  block_alpha.copy_from_host(alpha_host.data());
  block_beta.copy_from_host(beta_host.data());
}

/// Populates a Gemm::Arguments structure from the given commandline options
typename Gemm::Arguments args_from_options(const Options &options, bool host_problem_shapes_available = true) {
  hytlass::KernelHardwareInfo hw_info;
  // Change device_id to another value if you are running on a machine with multiple GPUs and wish
  // to use a GPU other than that with device ID 0.
  hw_info.device_id = 0;
  hw_info.sm_count = hytlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);

  typename Gemm::EpilogueOutputOp::Params params;
  if (options.alpha != FLT_MAX && options.beta != FLT_MAX) {
    // If both alpha/beta are provided (via cmd line args) and are scalar, i.e., same alpha/beta applies to all batches.
    params = typename Gemm::EpilogueOutputOp::Params(
        ElementAccumulator(options.alpha), ElementAccumulator(options.beta));
  }
  else {
    // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups.
    params = typename Gemm::EpilogueOutputOp::Params(alpha_device.get(), beta_device.get());
  }

  typename Gemm::Arguments arguments;
  if (host_problem_shapes_available) {
    arguments = typename Gemm::Arguments{
        hytlass::gemm::GemmUniversalMode::kGrouped,
        {options.groups, problem_sizes.get(), options.problem_sizes_host.data()},
        {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get()},
        {params, ptr_C.get(), stride_C.get(), ptr_D.get(), stride_D.get()},
        hw_info};
  }
  else {
    arguments = typename Gemm::Arguments{
        hytlass::gemm::GemmUniversalMode::kGrouped,
        {options.groups, problem_sizes.get(), nullptr},
        {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get()},
        {params, ptr_C.get(), stride_C.get(), ptr_D.get(), stride_D.get()},
        hw_info};
  }

  return arguments;
}
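// Note: main() below exercises both argument variants. When
// host_problem_shapes_available is false, nullptr is passed for the host copy of
// the problem shapes and the kernel presumably has to schedule from the
// device-side problem_sizes array alone.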
bool
verify
(
const
Options
&
options
)
{
bool
passed
=
true
;
for
(
int32_t
i
=
0
;
i
<
options
.
groups
;
++
i
)
{
auto
problem
=
options
.
problem_sizes_host
.
at
(
i
);
auto
M
=
get
<
0
>
(
problem
);
auto
N
=
get
<
1
>
(
problem
);
auto
K
=
get
<
2
>
(
problem
);
hytlass
::
TensorRef
ref_A
(
block_A
.
get
()
+
offset_A
.
at
(
i
),
Gemm
::
LayoutA
::
packed
({
M
,
K
}));
hytlass
::
TensorRef
ref_B
(
block_B
.
get
()
+
offset_B
.
at
(
i
),
Gemm
::
LayoutB
::
packed
({
K
,
N
}));
hytlass
::
TensorRef
ref_C
(
block_C
.
get
()
+
offset_C
.
at
(
i
),
Gemm
::
LayoutC
::
packed
({
M
,
N
}));
hytlass
::
TensorRef
ref_D
(
block_ref_D
.
get
()
+
offset_D
.
at
(
i
),
Gemm
::
LayoutD
::
packed
({
M
,
N
}));
//
// Compute reference output
//
// Create instantiation for device reference gemm kernel
DeviceGemmReference
gemm_reference
;
// Launch device reference gemm kernel
gemm_reference
(
{
M
,
N
,
K
},
ElementAccumulator
(
alpha_host
.
at
(
i
)),
ref_A
,
ref_B
,
ElementAccumulator
(
beta_host
.
at
(
i
)),
ref_C
,
ref_D
);
// Wait for kernel to finish
HIP_CHECK
(
hipDeviceSynchronize
());
// Check if output from HYTLASS kernel and reference kernel are equal or not
passed
&=
hytlass
::
reference
::
device
::
BlockCompareEqual
(
block_ref_D
.
get
()
+
offset_D
.
at
(
i
),
block_D
.
get
()
+
offset_D
.
at
(
i
),
M
*
N
);
#if 0
std::cout << "Group: " << i << " Status: " << passed << std::endl;
#endif
}
return
passed
;
}
/// Execute a given example GEMM computation
template <typename Gemm>
int run(Options &options, bool host_problem_shapes_available = true)
{
  allocate(options);
  initialize(options);

  // Instantiate HYTLASS kernel depending on templates
  Gemm gemm;

  // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
  auto arguments = args_from_options(options, host_problem_shapes_available);

  // Using the arguments, query for extra workspace required for matrix multiplication computation
  size_t workspace_size = Gemm::get_workspace_size(arguments);

  // Allocate workspace memory
  hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  // Check if the problem size is supported or not
  HYTLASS_CHECK(gemm.can_implement(arguments));

  // Initialize HYTLASS kernel with arguments and workspace pointer
  HYTLASS_CHECK(gemm.initialize(arguments, workspace.get()));

  // Correctness / Warmup iteration
  HYTLASS_CHECK(gemm.run());

  // Check if output from HYTLASS kernel and reference kernel are equal or not
  Result result;
  result.passed = verify(options);

  std::cout << "  Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;

  if (!result.passed) {
    exit(-1);
  }

  // Run profiling loop
  if (options.iterations > 0) {
    GpuTimer timer;
    timer.start();
    for (int iter = 0; iter < options.iterations; ++iter) {
      HYTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
      HYTLASS_CHECK(gemm.run());
    }
    timer.stop();

    // Compute average setup and runtime and GFLOPs.
    float elapsed_ms = timer.elapsed_millis();
    result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
    result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host);

    std::cout << "  Problem Sizes, Alpha, Beta " << std::endl;
    for (int32_t i = 0; i < options.groups; ++i) {
      std::cout << "    " << options.problem_sizes_host.at(i);
      std::cout << ", " << alpha_host.at(i) << ", " << beta_host.at(i) << std::endl;
    }
    std::cout << "  Groups      : " << options.groups << std::endl;
    std::cout << "  Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl;
    std::cout << "  GFLOPS      : " << result.gflops << std::endl;
  }

  return 0;
}
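// A note on the two invocations in main() below: the second passes
// host_problem_shapes_available = false, exercising the path where per-group
// problem shapes are only available in device memory, so per-group tile counts
// cannot be precomputed on the host (the exact behavior is defined by the
// kernel's group scheduler).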
///////////////////////////////////////////////////////////////////////////////////////////////////

int main(int argc, char const **args)
{
  //
  // Parse options
  //

  Options options;
  options.parse(argc, args);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  //
  // Evaluate HYTLASS kernels
  //

  run<Gemm>(options);
  run<Gemm>(options, false /*host_problem_shapes_available*/);

  return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
examples/16_hytlass_gemm_softmax/CMakeLists.txt
0 → 100644
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
  gemm_softmax
  gemm_softmax.cu
)
examples/16_hytlass_gemm_softmax/gemm_softmax.cu
0 → 100644
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
*/
#include <cmath>
#include <iostream>
#include <vector>
#include <limits>
#include "hytlass/hytlass.h"
#include "hytlass/arch/memory.h"
#include "hytlass/arch/memory_gfx928.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/reference/host/gemm_complex.h"
#include "hytlass/util/reference/device/gemm_complex.h"
#include "hytlass/util/reference/host/tensor_reduce.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_norm.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/device/tensor_fill.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/reference/host/error_metrics.h"
#include "hytlass/util/tensor_view_io.h"
#include "hytlass/layout/matrix.h"
#include "hytlass/epilogue/thread/linear_combination.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
#include "gemm_with_softmax.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
#define TRACE(x) { std::cout << "gemm_softmax.cu:" << __LINE__ << " " << x << std::endl; }
/////////////////////////////////////////////////////////////////////////////////////////////////
enum class Disposition {
  kPassed,
  kIncorrect,
  kNotVerified
};

/////////////////////////////////////////////////////////////////////////////////////////////////

// Command line options parsing
struct Options {

  bool help;

  hytlass::gemm::GemmCoord problem_size;
  int batch_count;
  int iterations;
  unsigned seed;
  float alpha;
  float beta;
  bool verification_enabled;
  float tolerance;

  Options():
    help(false),
    problem_size({16, 24, 64}),
    batch_count(16),
    iterations(20),
    seed(2022),
    alpha(1),
    beta(0),
    verification_enabled(true),
    tolerance(1e-5f)
  { }

  bool valid() {
    return true;
  }

  // Parses the command line
  void parse(int argc, char const **args) {
    hytlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
    }

    cmd.get_cmd_line_argument("m", problem_size.m());
    cmd.get_cmd_line_argument("n", problem_size.n());
    cmd.get_cmd_line_argument("k", problem_size.k());
    cmd.get_cmd_line_argument("batch_count", batch_count);
    cmd.get_cmd_line_argument("alpha", alpha);
    cmd.get_cmd_line_argument("beta", beta);
    cmd.get_cmd_line_argument("iterations", iterations);
    cmd.get_cmd_line_argument("verify", verification_enabled);
    cmd.get_cmd_line_argument("seed", seed);
    cmd.get_cmd_line_argument("tolerance", tolerance);
  }

  /// Prints the usage statement.
  std::ostream & print_usage(std::ostream &out) const {

    out << "16_gemm_softmax example\n\n"
      << "  This example uses the HYTLASS Library to compute GEMM + Softmax for arbitrary problem sizes.\n\n"
      << "Options:\n\n"
      << "  --help                      If specified, displays this usage statement.\n\n"
      << "  --m=<int>                   GEMM M dimension\n"
      << "  --n=<int>                   GEMM N dimension\n"
      << "  --k=<int>                   GEMM K dimension\n"
      << "  --batch_count=<int>         Batch number\n"
      << "  --alpha=<f32>               Epilogue scalar alpha\n"
      << "  --beta=<f32>                Epilogue scalar beta\n\n"
      << "  --seed=<int>                Random number seed (1*)\n\n"
      << "  --iterations=<int>          Number of profiling iterations to perform (0 to disable profiling).\n\n"
      << "  --verify=<bool>             If true, performs reference calculation.\n\n"
      << "  --tolerance <float>         Error tolerance\n";

    out << "\n\nExamples:\n\n"
      << "$ ./gemm_softmax --m=1024 --n=512 \\\n"
      << "     --alpha=2 --beta=0.707\n\n";

    return out;
  }

  /// Returns true if the environment and Toolkit support this
  bool supported(bool verbose = true) const {
    return true;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
struct Testbed {

  //
  // Type definitions
  //

  using ElementA = hytlass::half_t;
  using ElementB = hytlass::half_t;
  using ElementC = hytlass::half_t;
  using ElementCompute = float;
  using ElementD = ElementC;
  using ElementSoftmax = ElementC;

  using LayoutA = hytlass::layout::RowMajor;
  using LayoutB = hytlass::layout::ColumnMajor;

  using ThreadblockShape = hytlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = hytlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = hytlass::gemm::GemmShape<16, 16, 16>;

  using OperatorClass = hytlass::arch::OpClassTensorOp;
  using ArchTag = hytlass::arch::Gfx928;

  // ApplyShape impacts the final Softmax performance a lot.
  // Set ApplyShape::kColumn to the next multiple of 32 after (gemm_N / alignment).
  // Set ApplyShape::kRow to max(1, 128 / ApplyShape::kColumn).
  using ApplyShape = hytlass::MatrixShape<1, 1024>;
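  // A worked example of the rule above (an illustrative sketch, not from the
  // library docs): with gemm_N = 8160 and an 8-element half_t alignment,
  // gemm_N / alignment = 1020, the next multiple of 32 is 1024, so
  // kColumn = 1024 and kRow = max(1, 128 / 1024) = 1 -- i.e., the
  // MatrixShape<1, 1024> used here.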
  static int const kStages = 1;

  /// Linear scaling operator
  using EpilogueFunctorOp = hytlass::epilogue::thread::LinearCombination<
    ElementC,
    128 / hytlass::sizeof_bits<ElementC>::value,
    ElementCompute,
    ElementCompute
  >;

  using GemmSoftmax = hytlass::GemmSoftmax<
    ElementA, LayoutA,
    ElementB, LayoutB,
    ElementC,
    ElementCompute,
    OperatorClass,
    ArchTag,
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    EpilogueFunctorOp,
    kStages,
    ApplyShape
  >;

  using ElementNorm = typename GemmSoftmax::ElementNorm;
  using ElementSum = typename GemmSoftmax::ElementSum;
  using LayoutC = typename GemmSoftmax::LayoutC;
  using LayoutN = typename GemmSoftmax::LayoutN;
  using LayoutS = typename GemmSoftmax::LayoutS;
  using MatrixCoord = typename LayoutC::TensorCoord;

  //
  // Data members
  //

  Options const &options;

  hytlass::HostTensor<ElementNorm, LayoutC> reference_N;
  hytlass::DeviceAllocation<ElementA> block_A;
  hytlass::DeviceAllocation<ElementB> block_B;
  hytlass::DeviceAllocation<ElementC> block_C;
  hytlass::DeviceAllocation<ElementD> block_D;
  hytlass::DeviceAllocation<ElementD> block_Ref;
  hytlass::DeviceAllocation<ElementSoftmax> block_Softmax;
  hytlass::DeviceAllocation<ElementNorm> block_Norm;
  hytlass::DeviceAllocation<ElementSum> block_Sum;

  int block_num = (options.problem_size.n() + GemmSoftmax::ThreadblockShape::kN - 1) / GemmSoftmax::ThreadblockShape::kN;
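  // block_num counts the threadblock-wide column tiles per row. Each tile emits
  // one partial max/sum entry per row, which is why the partial norm/sum
  // buffers sized below hold block_num * M elements per batch.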
  hytlass::gemm::GemmCoord problem = options.problem_size;

  int64_t lda = LayoutA::packed({problem.m(), problem.k()}).stride(0);
  int64_t ldb = LayoutB::packed({problem.k(), problem.n()}).stride(0);
  int64_t ldc = LayoutC::packed({problem.m(), problem.n()}).stride(0);

  // fixed rowmajor for norm and sum
  int64_t ldn = problem.m();
  int64_t lds = ldn;

  int64_t total_elements_A_per_batch = problem.m() * problem.k();
  int64_t total_elements_B_per_batch = problem.k() * problem.n();
  int64_t total_elements_C_per_batch = problem.m() * problem.n();
  int64_t total_elements_D_per_batch = problem.m() * problem.n();
  int64_t total_elements_partial_norm_per_batch = block_num * problem.m();

  int64_t total_elements_A = total_elements_A_per_batch * options.batch_count;
  int64_t total_elements_B = total_elements_B_per_batch * options.batch_count;
  int64_t total_elements_C = total_elements_C_per_batch * options.batch_count;
  int64_t total_elements_D = total_elements_D_per_batch * options.batch_count;
  int64_t total_elements_partial_norm = total_elements_partial_norm_per_batch * options.batch_count;

  //
  // Methods
  //

  Testbed(
    Options const &options_
  ):
    options(options_)
  {
    reference_N.reset({options.problem_size.m(), 1}, false);
  }
  /// Run
  Disposition run() {

    Disposition disposition = Disposition::kNotVerified;

    //
    // Initialize the workspace
    //

    initialize();

    //
    // Launch device kernel
    //

    hytlass::Status status = hytlass::Status::kSuccess;
    status = execute_device_kernel();

    if (status != hytlass::Status::kSuccess) {
      std::cerr << "Device execution failed." << std::endl;
      return disposition;
    }

    hipError_t result = hipDeviceSynchronize();
    if (result != hipSuccess) {
      std::cerr << "Device synchronize failed with error "
        << hipGetErrorString(result) << std::endl;
      return disposition;
    }

    //
    // Verify
    //

    if (options.verification_enabled) {

      bool passed = verify();

      if (passed) {
        disposition = Disposition::kPassed;
      }
      else {
        disposition = Disposition::kIncorrect;
      }
    }

    //
    // Profiling
    //

    if (options.iterations) {
      profile();
    }

    return disposition;
  }
  /// Random initialization
  void initialize() {

    block_A.reset(total_elements_A);
    block_B.reset(total_elements_B);
    block_C.reset(total_elements_C);
    block_D.reset(total_elements_D);
    block_Softmax.reset(total_elements_D);
    block_Ref.reset(total_elements_D_per_batch);
    block_Norm.reset(total_elements_partial_norm);
    block_Sum.reset(total_elements_partial_norm);

    hytlass::reference::device::BlockFillRandomUniform(
      block_A.get(), total_elements_A, options.seed, ElementA(5), ElementA(-5), 0);

    hytlass::reference::device::BlockFillRandomUniform(
      block_B.get(), total_elements_B, options.seed + 1, ElementB(5), ElementB(-5), 0);

    hytlass::reference::device::BlockFillRandomUniform(
      block_C.get(), total_elements_C, options.seed + 2, ElementC(5), ElementC(-5), 0);

    hytlass::reference::device::BlockFillRandomUniform(
      block_D.get(), total_elements_D, options.seed + 3, ElementD(5), ElementD(-5), 0);

    hytlass::reference::device::BlockFillRandomUniform(
      block_Ref.get(), total_elements_D_per_batch, options.seed + 3, ElementD(5), ElementD(-5), 0);

    hytlass::reference::device::BlockFillRandomUniform(
      block_Softmax.get(), total_elements_D, options.seed + 3, ElementSoftmax(5), ElementSoftmax(-5), 0);

    hytlass::reference::host::TensorFill(reference_N.host_view(), ElementNorm());
  }
  hytlass::Status execute_device_kernel() {

    hytlass::Status status = hytlass::Status::kSuccess;

    //
    // Setup arguments
    //

    GemmSoftmax::Arguments args(
      options.problem_size,
      options.batch_count,
      {block_A.get(), lda},
      {block_B.get(), ldb},
      {block_C.get(), ldc},
      {block_D.get(), ldc},
      {
        ElementCompute(options.alpha),
        ElementCompute(options.beta)
      },
      {block_Norm.get(), ldn},
      {block_Sum.get(), lds},
      {block_Softmax.get(), ldc},
      total_elements_A_per_batch,
      total_elements_B_per_batch,
      total_elements_C_per_batch,
      total_elements_D_per_batch,
      total_elements_partial_norm_per_batch,
      total_elements_partial_norm_per_batch,
      total_elements_D_per_batch
    );

    //
    // Launch
    //

    GemmSoftmax gemm_softmax;

    // Initialize
    status = gemm_softmax.initialize(args);
    if (status != hytlass::Status::kSuccess) {
      return status;
    }

    // Run
    status = gemm_softmax();

    return status;
  }
  template <typename Element>
  bool verify_tensor(std::vector<Element> vector_Input,
                     std::vector<Element> vector_Input_Ref) {

    auto size = int64_t((vector_Input.size() < vector_Input_Ref.size()) ? vector_Input.size() : vector_Input_Ref.size());

    float abs_tol = options.tolerance;
    float rel_tol = options.tolerance;

    for (int64_t i = 0; i < size; ++i) {
      float diff = (float)(vector_Input.at(i) - vector_Input_Ref.at(i));
      float abs_diff = fabs(diff);
      float abs_ref = fabs((float)vector_Input_Ref.at(i));
      float relative_diff = abs_ref > abs_tol ? abs_diff / abs_ref : 0;
      if ((isnan(abs_diff) || isinf(abs_diff)) || (abs_diff > rel_tol && relative_diff > rel_tol)) {
        printf("diff = %f, {%f, %f}.\n", abs_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i)));
        return false;
      }
    }

    return true;
  }
  /// Verifies the reference matches
  bool verify() {

    LayoutA layout_A(lda);
    LayoutB layout_B(ldb);
    LayoutC layout_C(ldc);
    LayoutN Layout_N(ldn);
    LayoutS Layout_S(lds);

    MatrixCoord extent_A{problem.m(), problem.k()};
    MatrixCoord extent_B{problem.k(), problem.n()};
    MatrixCoord extent_C{problem.m(), problem.n()};

    for (int batch_idx = 0; batch_idx < options.batch_count; batch_idx++) {

      hytlass::TensorView<ElementA, LayoutA> view_A(block_A.get() + total_elements_A_per_batch * batch_idx, layout_A, extent_A);
      hytlass::TensorView<ElementB, LayoutB> view_B(block_B.get() + total_elements_B_per_batch * batch_idx, layout_B, extent_B);
      hytlass::TensorView<ElementC, LayoutC> view_C(block_C.get() + total_elements_C_per_batch * batch_idx, layout_C, extent_C);
      hytlass::TensorView<ElementC, LayoutC> view_Ref_device(block_Ref.get(), layout_C, extent_C);

      hytlass::reference::device::GemmComplex<
          ElementA, LayoutA,
          ElementB, LayoutB,
          ElementC, LayoutC,
          ElementCompute, ElementCompute
      >(
        problem,
        options.alpha,
        view_A,
        hytlass::ComplexTransform::kNone,
        view_B,
        hytlass::ComplexTransform::kNone,
        options.beta,
        view_C,
        view_Ref_device,
        ElementCompute(0)
      );

      // Copy reference results to host memory for verification
      std::vector<ElementD> matrix_D_Ref(layout_C.capacity(extent_C));
      hytlass::device_memory::copy_to_host(matrix_D_Ref.data(), block_Ref.get(), matrix_D_Ref.size());
      hytlass::TensorView<ElementD, LayoutC> view_Ref(matrix_D_Ref.data(), layout_C, extent_C);

      std::vector<ElementSoftmax> matrix_Softmax_Ref(layout_C.capacity(extent_C));
      hytlass::TensorView<ElementSoftmax, LayoutC> view_Softmax_Ref(matrix_Softmax_Ref.data(), layout_C, extent_C);

      // Copy computed results to host memory
      std::vector<ElementD> matrix_D(layout_C.capacity(extent_C));
      hytlass::device_memory::copy_to_host(matrix_D.data(), block_D.get() + total_elements_D_per_batch * batch_idx, matrix_D.size());

      std::vector<ElementD> matrix_Softmax(layout_C.capacity(extent_C));
      hytlass::device_memory::copy_to_host(matrix_Softmax.data(), block_Softmax.get() + total_elements_D_per_batch * batch_idx, matrix_Softmax.size());
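      // The reference below is the standard two-pass, numerically stable
      // softmax: per row m,
      //   N[m]          = max_n D[m, n]
      //   sum[m]        = sum_n exp(D[m, n] - N[m])
      //   Softmax[m, n] = exp(D[m, n] - N[m]) / sum[m]
      // Subtracting the row maximum keeps exp() within floating-point range.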
      // Compute the norm
      for (int m = 0; m < options.problem_size.m(); ++m) {
        reference_N.at({m, 0}) = view_Ref.ref().at({m, 0});
        for (int n = 1; n < options.problem_size.n(); ++n) {
          reference_N.at({m, 0}) = std::max(reference_N.at({m, 0}), ElementNorm(view_Ref.ref().at({m, n})));
        }
      }

      // Compute softmax
      for (int m = 0; m < options.problem_size.m(); ++m) {

        float sum = float();

        for (int n = 0; n < options.problem_size.n(); ++n) {
          sum += std::exp(float(view_Ref.ref().at({m, n})) - float(reference_N.at({m, 0})));
        }

        float inv_sum = float(1.0f / sum);

        for (int n = 0; n < options.problem_size.n(); ++n) {
          view_Softmax_Ref.ref().at({m, n}) = ElementSoftmax(
            std::exp(float(view_Ref.ref().at({m, n})) - float(reference_N.at({m, 0}))) * inv_sum
          );
        }
      }

      // Verification checks - set any of these to 'true' to override the verification checks.
      bool verified_D = false;
      bool verified_Softmax = false;

      // Verify softmax output
      if (!verified_D) {
        verified_D = verify_tensor<ElementC>(matrix_D, matrix_D_Ref);
      }

      if (!verified_Softmax) {
        verified_Softmax = verify_tensor<ElementSoftmax>(matrix_Softmax, matrix_Softmax_Ref);
      }

      if (!verified_D || !verified_Softmax) {

        std::cerr << "Verification check failed at batch " << batch_idx << "\n";

        // Summarize which checks failed
        if (!verified_D) {
          std::cerr << "Verification of D tensor failed\n";
        }

        if (!verified_Softmax) {
          std::cerr << "Verification of Softmax tensor failed\n";
        }

        return false;
      }
    }

    return true;
  }
  /// Profiles
  bool profile() {

    //
    // Profile
    //

    hytlass::Status status = hytlass::Status::kSuccess;
    hipError_t result;
    hipEvent_t events[2];
    int const kIterations = options.iterations;

    for (hipEvent_t &evt : events) {
      result = hipEventCreate(&evt);
      if (result != hipSuccess) {
        std::cerr << "hipEventCreate failed with error " << hipGetErrorString(result) << std::endl;
        return false;
      }
    }

    result = hipEventRecord(events[0]);
    if (result != hipSuccess) {
      std::cerr << "hipEventRecord() failed with error " << hipGetErrorString(result) << std::endl;
      return false;
    }

    for (int iter = 0; iter < kIterations; ++iter) {
      status = execute_device_kernel();
      if (status != hytlass::Status::kSuccess) {
        std::cerr << "Device execution failed." << std::endl;
        return false;
      }
    }

    result = hipEventRecord(events[1]);
    if (result != hipSuccess) {
      std::cerr << "hipEventRecord() failed with error " << hipGetErrorString(result) << std::endl;
      return false;
    }

    result = hipDeviceSynchronize();
    if (result != hipSuccess) {
      std::cerr << "hipDeviceSynchronize() failed with error " << hipGetErrorString(result) << std::endl;
      return false;
    }

    float elapsed_ms = 0;
    result = hipEventElapsedTime(&elapsed_ms, events[0], events[1]);
    if (result != hipSuccess) {
      std::cerr << "hipEventElapsedTime() failed with error " << hipGetErrorString(result) << std::endl;
      return false;
    }

    for (hipEvent_t &evt : events) {
      result = hipEventDestroy(evt);
      if (result != hipSuccess) {
        std::cerr << "hipEventDestroy() failed with error " << hipGetErrorString(result) << std::endl;
        return false;
      }
    }

    int64_t flops = int64_t(options.problem_size.m()) * options.problem_size.n() * options.problem_size.k() * 2;
    int64_t bytes = (sizeof(ElementD) * 2 + sizeof(ElementSoftmax)) * options.problem_size.m() * options.problem_size.n();
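    // A rough traffic model (a sketch): D is written once by the GEMM epilogue
    // and re-read once by the softmax apply kernel, and the softmax result is
    // written once -- (2 * sizeof(ElementD) + sizeof(ElementSoftmax)) bytes per
    // output element. Partial norm/sum traffic is ignored here.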
    double gflops_per_second = double(flops) * kIterations * options.batch_count / double(elapsed_ms / 1000.0f) / double(1.0e9);
    double gbytes_per_second = double(bytes) * kIterations * options.batch_count / double(elapsed_ms / 1000.0f) / double(1 << 30);
    double elapsed_ms_per_iter = double(elapsed_ms) / kIterations;

    std::cout << "         Problem: "
      << options.problem_size.m() << "-by-" << options.problem_size.n() << "-by-" << options.problem_size.k()
      << ", batch size: " << options.batch_count << std::endl;
    std::cout << "         Runtime: " << elapsed_ms_per_iter << " ms\n" << std::endl;
    std::cout << "          GFLOPs: " << gflops_per_second << " GFLOPs" << std::endl;
    std::cout << "Memory bandwidth: " << gbytes_per_second << " GiB/s" << std::endl;

    return true;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, const char **argv) {

  // Options parsing
  Options options;
  options.parse(argc, argv);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  if (!options.supported()) {
    return 0;
  }

  // Run
  Testbed testbed(options);
  Disposition disposition = testbed.run();

  std::cout << std::endl;

  switch (disposition) {
    case Disposition::kPassed:
      std::cout << "Passed" << std::endl;
      break;
    case Disposition::kIncorrect:
      std::cout << "Incorrect" << std::endl;
      break;
    case Disposition::kNotVerified:
      std::cout << "Not verified" << std::endl;
      break;
  }

  return (disposition == Disposition::kPassed ? 0 : -1);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
examples/16_hytlass_gemm_softmax/gemm_with_epilogue_visitor.h
0 → 100644
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
    \brief GEMM kernel to support the epilogue visitor model
           for customized softmax partial reduction epilogue fusion.

    This source file will likely be moved to `include/hytlass/gemm/kernel/` in the future once
    its usage has been stabilized. For now, it is included in this example to demonstrate
    some basic output fusion options.
*/
#pragma once
#include "hytlass/hytlass.h"
#include "hytlass/tensor_ref.h"
#include "hytlass/fast_math.h"
#include "hytlass/gemm/gemm.h"
#include "hytlass/matrix_coord.h"
#include "hytlass/complex.h"
#include "hytlass/semaphore.h"
#include "hytlass/trace.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace hytlass {
namespace gemm {
namespace kernel {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <
  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
  typename Epilogue_,             ///! Epilogue
  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
>
struct GemmWithEpilogueVisitor {
public:

  using Mma = Mma_;
  using Epilogue = Epilogue_;
  using EpilogueVisitor = typename Epilogue::Visitor;
  using ThreadblockSwizzle = ThreadblockSwizzle_;

  using ElementA = typename Mma::IteratorA::Element;
  using LayoutA = typename Mma::IteratorA::Layout;
  using TensorRefA = TensorRef<ElementA, LayoutA>;

  using ElementB = typename Mma::IteratorB::Element;
  using LayoutB = typename Mma::IteratorB::Layout;
  using TensorRefB = TensorRef<ElementB, LayoutB>;

  using ElementC = typename EpilogueVisitor::ElementOutput;
  using LayoutC = typename Epilogue::Layout;
  using TensorRefC = TensorRef<ElementC, LayoutC>;

  static ComplexTransform const kTransformA = ComplexTransform::kNone;
  static ComplexTransform const kTransformB = ComplexTransform::kNone;

  using Operator = typename Mma::Operator;
  using OperatorClass = typename Mma::Operator::OperatorClass;
  using ThreadblockShape = typename Mma::Shape;
  using WarpShape = typename Mma::Operator::Shape;
  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
  using ArchTag = typename Mma::ArchTag;
  using ElementNorm = typename EpilogueVisitor::ElementNorm;
  using ElementSum = typename EpilogueVisitor::ElementSum;

  static int const kStages = Mma::kStages;
  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
  static int const kAlignmentC = EpilogueVisitor::kElementsPerAccess;

  /// Warp count (concept: GemmShape)
  using WarpCount = typename Mma::WarpCount;
  static int const kThreadCount = WARP_SIZE_GPU * WarpCount::kCount;

  /// Split-K preserves splits that are 128b aligned
  static int const kSplitKAlignment = const_max(
    128 / sizeof_bits<ElementA>::value,
    128 / sizeof_bits<ElementB>::value
  );
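  // For instance (an illustrative sketch): with half_t A and B operands,
  // sizeof_bits is 16, so kSplitKAlignment = 128 / 16 = 8 elements, and
  // split-K boundaries always fall on 128-bit (8-element) multiples.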
  //
  // Structures
  //

  /// Argument structure
  struct Arguments {

    //
    // Data members
    //

    GemmUniversalMode mode;
    GemmCoord problem_size;
    int batch_count;

    TensorRefA ref_A;
    TensorRefB ref_B;
    TensorRefC ref_C;
    TensorRefC ref_D;

    ElementNorm *ptr_Max;
    ElementSum *ptr_Sum;

    int64_t batch_stride_A;
    int64_t batch_stride_B;

    typename EpilogueVisitor::Arguments epilogue_visitor;

    //
    // Methods
    //

    Arguments():
      mode(GemmUniversalMode::kGemm),
      batch_count(1)
    { }

    /// constructs an arguments structure
    Arguments(
      GemmUniversalMode mode_,
      GemmCoord problem_size_,
      int batch_count_,
      TensorRefA ref_A_,
      TensorRefB ref_B_,
      TensorRefC ref_C_,
      TensorRefC ref_D_,
      ElementNorm *ptr_Max_,
      ElementSum *ptr_Sum_,
      int64_t batch_stride_A_,
      int64_t batch_stride_B_,
      typename EpilogueVisitor::Arguments epilogue_visitor_
    ):
      mode(mode_),
      problem_size(problem_size_),
      batch_count(batch_count_),
      ref_A(ref_A_),
      ref_B(ref_B_),
      ref_C(ref_C_),
      ref_D(ref_D_),
      ptr_Max(ptr_Max_),
      ptr_Sum(ptr_Sum_),
      batch_stride_A(batch_stride_A_),
      batch_stride_B(batch_stride_B_),
      epilogue_visitor(epilogue_visitor_)
    { }
  };
  //
  // Structure for precomputing values in host memory and passing to kernels
  //

  /// Parameters structure
  struct Params {

    hytlass::gemm::GemmCoord problem_size;
    hytlass::gemm::GemmCoord grid_tiled_shape;
    int swizzle_log_tile;

    typename Mma::IteratorA::Params params_A;
    typename Mma::IteratorB::Params params_B;
    typename EpilogueVisitor::OutputTileIterator::Params params_C;
    typename EpilogueVisitor::OutputTileIterator::Params params_D;

    GemmUniversalMode mode;
    int batch_count;
    int gemm_k_size;

    void *ptr_A;
    void *ptr_B;
    ElementC *ptr_C;
    ElementC *ptr_D;

    ElementNorm *ptr_Max;
    ElementSum *ptr_Sum;

    int64_t batch_stride_A;
    int64_t batch_stride_B;

    typename EpilogueVisitor::Params epilogue_visitor;

    //
    // Methods
    //

    HYTLASS_HOST_DEVICE
    Params():
      swizzle_log_tile(0),
      params_A(0),
      params_B(0),
      params_C(0),
      params_D(0),
      mode(hytlass::gemm::GemmUniversalMode::kGemm),
      batch_count(0),
      gemm_k_size(0),
      ptr_A(nullptr),
      ptr_B(nullptr),
      ptr_C(nullptr),
      ptr_D(nullptr),
      ptr_Max(nullptr),
      ptr_Sum(nullptr),
      batch_stride_A(0),
      batch_stride_B(0)
    { }

    Params(
      Arguments const &args
    ):
      problem_size(args.problem_size),
      swizzle_log_tile(0),
      params_A(args.ref_A.layout()),
      params_B(args.ref_B.layout()),
      params_C(args.ref_C.layout()),
      params_D(args.ref_D.layout()),
      mode(args.mode),
      batch_count(args.batch_count),
      gemm_k_size(args.problem_size.k()),
      ptr_A(args.ref_A.data()),
      ptr_B(args.ref_B.data()),
      ptr_C(args.ref_C.data()),
      ptr_D(args.ref_D.data()),
      ptr_Max(args.ptr_Max),
      ptr_Sum(args.ptr_Sum),
      batch_stride_A(args.batch_stride_A),
      batch_stride_B(args.batch_stride_B),
      epilogue_visitor(args.epilogue_visitor)
    {
      ThreadblockSwizzle threadblock_swizzle;

      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
        args.problem_size,
        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
        args.batch_count);

      if (args.mode == GemmUniversalMode::kGemm || args.mode == GemmUniversalMode::kGemmSplitKParallel) {

        int const kAlignK = const_max(const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value), 1);

        gemm_k_size = round_up(ceil_div(args.problem_size.k(), args.batch_count), kAlignK);

        if (gemm_k_size) {
          grid_tiled_shape.k() = ceil_div(args.problem_size.k(), gemm_k_size);
        }
      }

      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
    }
  };
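  // A worked split-K sizing example (an illustrative sketch): for k = 4096,
  // batch_count = 4 and half_t operands (kAlignK = 8), gemm_k_size =
  // round_up(ceil_div(4096, 4), 8) = 1024 and grid_tiled_shape.k() =
  // ceil_div(4096, 1024) = 4 k-slices.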
  /// Shared memory storage structure
  union SharedStorage {

    typename Mma::SharedStorage main_loop;

    struct {
      typename Epilogue::SharedStorage epilogue;
      typename EpilogueVisitor::SharedStorage visitor;
    } epilogue;
  };
public:

  //
  // Methods
  //

  HYTLASS_DEVICE
  GemmWithEpilogueVisitor() { }

  /// Determines whether kernel satisfies alignment
  static Status can_implement(hytlass::gemm::GemmCoord const &problem_size) {

    HYTLASS_TRACE_HOST("GemmWithEpilogueVisitor::can_implement()");

    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;

    bool isAMisaligned = false;
    bool isBMisaligned = false;
    bool isCMisaligned = false;

    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
      isAMisaligned = problem_size.k() % kAlignmentA;
    }
    else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
      isAMisaligned = problem_size.m() % kAlignmentA;
    }
    else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value ||
             platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
      isAMisaligned = problem_size.k() % kAlignmentA;
    }

    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
      isBMisaligned = problem_size.n() % kAlignmentB;
    }
    else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
      isBMisaligned = problem_size.k() % kAlignmentB;
    }
    else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value ||
             platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
      isBMisaligned = problem_size.k() % kAlignmentB;
    }

    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
      isCMisaligned = problem_size.n() % kAlignmentC;
    }
    else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
      isCMisaligned = problem_size.m() % kAlignmentC;
    }
    else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value ||
             platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
      isCMisaligned = problem_size.n() % kAlignmentC;
    }

    if (isAMisaligned) {
      HYTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
      return Status::kErrorMisalignedOperand;
    }

    if (isBMisaligned) {
      HYTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
      return Status::kErrorMisalignedOperand;
    }

    if (isCMisaligned) {
      HYTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
      return Status::kErrorMisalignedOperand;
    }

    HYTLASS_TRACE_HOST("  returning kSuccess");
    return Status::kSuccess;
  }

  static Status can_implement(Arguments const &args) {
    return can_implement(args.problem_size);
  }
#define SPLIT_K_ENABLED 1

  /// Executes one GEMM
  HYTLASS_DEVICE
  void operator()(Params const &params, SharedStorage &shared_storage) {

    // Compute threadblock location
    ThreadblockSwizzle threadblock_swizzle;

    hytlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);

    // Early exit if CTA is out of range
    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
        params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
      return;
    }

    int offset_k = 0;
    int problem_size_k = params.problem_size.k();

    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);

#if SPLIT_K_ENABLED
    //
    // Fetch pointers based on mode.
    //
    if (params.mode == GemmUniversalMode::kGemm ||
        params.mode == GemmUniversalMode::kGemmSplitKParallel) {

      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
      }

      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
    }
    else if (params.mode == GemmUniversalMode::kBatched) {
      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
    }
    else if (params.mode == GemmUniversalMode::kArray) {
      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
    }
#endif

    // Compute initial location in logical coordinates
    hytlass::MatrixCoord tb_offset_A{
      threadblock_tile_offset.m() * Mma::Shape::kM,
      offset_k
    };

    hytlass::MatrixCoord tb_offset_B{
      offset_k,
      threadblock_tile_offset.n() * Mma::Shape::kN
    };

    // Compute position within threadblock
    int thread_idx = threadIdx.x;

    // Construct iterators to A and B operands
    typename Mma::IteratorA iterator_A(
      params.params_A,
      ptr_A,
      {params.problem_size.m(), problem_size_k},
      thread_idx,
      tb_offset_A);

    typename Mma::IteratorB iterator_B(
      params.params_B,
      ptr_B,
      {problem_size_k, params.problem_size.n()},
      thread_idx,
      tb_offset_B);

    // Broadcast the warp_id computed by lane 0 to ensure dependent code
    // is compiled as warp-uniform.
    int warp_idx = threadIdx.x / WARP_SIZE_GPU;
    int lane_idx = threadIdx.x % WARP_SIZE_GPU;

    //
    // Main loop
    //

    // Construct thread-scoped matrix multiply
    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);

    typename Mma::FragmentC accumulators;
    accumulators.clear();

    // Compute threadblock-scoped matrix multiply-add
    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;

    mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);

    //
    // Masked tile iterators constructed from members
    //

    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);

    // assume identity swizzle
    MatrixCoord threadblock_offset(
      threadblock_tile_offset.m() * Mma::Shape::kM,
      threadblock_tile_offset.n() * Mma::Shape::kN);

    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();

    //
    // Construct the epilogue visitor
    //

    EpilogueVisitor epilogue_visitor(
      params.epilogue_visitor,
      shared_storage.epilogue.visitor,
      params.problem_size.mn(),
      thread_idx,
      warp_idx,
      lane_idx,
      params.params_C,
      params.params_D,
      params.ptr_C,
      params.ptr_D,
      params.ptr_Max,
      params.ptr_Sum,
      threadblock_offset,
      blockIdx.y * params.problem_size.m());

    if (params.mode == GemmUniversalMode::kGemm) {
      // Indicate which position in a serial reduction the output operator is currently updating
      epilogue_visitor.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
    }
    else if (params.mode == GemmUniversalMode::kBatched || params.mode == GemmUniversalMode::kArray) {
      epilogue_visitor.set_batch_index(threadblock_tile_offset.k());
    }

    // Construct the epilogue
    Epilogue epilogue(
      shared_storage.epilogue.epilogue,
      thread_idx,
      warp_idx,
      lane_idx);

    // Execute the epilogue operator to update the destination tensor.
    epilogue(epilogue_visitor, accumulators);
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace gemm
} // namespace hytlass
/////////////////////////////////////////////////////////////////////////////////////////////////
examples/16_hytlass_gemm_softmax/gemm_with_softmax.h
0 → 100644
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
*/
#pragma once
/////////////////////////////////////////////////////////////////////////////////////////////////
#include <cmath>
#include <iostream>
#include <vector>
#include <limits>
#include "hytlass/hytlass.h"
#include "hytlass/device_kernel.h"
#include "hytlass/arch/memory.h"
#include "hytlass/arch/memory_gfx928.h"
#include "hytlass/gemm/kernel/default_gemm.h"
#include "hytlass/gemm/device/default_gemm_configuration.h"
#include "hytlass/epilogue/threadblock/epilogue_visitor_with_softmax.h"
#include "hytlass/epilogue/threadblock/epilogue_with_visitor.h"
#include "hytlass/reduction/kernel/reduce_softmax_final.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
#include "gemm_with_epilogue_visitor.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace hytlass {

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace kernel {

/////////////////////////////////////////////////////////////////////////////////////////////////

//
// Kernel computes partial reduction
//
// 1. N[m, 0]    = max_n(D[m, n])                     (row maximum, produced by the GEMM epilogue visitor)
// 2. Sum[m, n'] = sum_n(exp(D[m, n] - N[m, 0]))
//
template <
  typename ElementD_,
  typename ElementNorm_,
  typename ElementSum_,
  typename ElementSoft_,
  typename ElementSoftmaxCompute_,
  int Alignment,
  typename ApplyShape_ = MatrixShape<1, 1024>
>
class ApplySoftmax {
public:

  using ElementD = ElementD_;
  using ElementNorm = ElementNorm_;
  using ElementSum = ElementSum_;
  using ElementSoft = ElementSoft_;
  using ElementSoftmaxCompute = ElementSoftmaxCompute_;

  static int const kAlignment = Alignment;
  using ApplyShape = ApplyShape_;

  using Layout = hytlass::layout::RowMajor;

  using TensorRefD = TensorRef<ElementD, Layout>;
  using TensorRefN = TensorRef<ElementNorm, Layout>;
  using TensorRefSum = TensorRef<ElementSum, Layout>;
  using TensorRefSoft = TensorRef<ElementSoft, Layout>;

  using FragmentSoftmax = Array<ElementSoftmaxCompute, kAlignment>;
  //
  // Arguments
  //

  struct Arguments {

    MatrixCoord extent;           ///< Extent of D and Softmax matrices
    int batch_count;              ///< Batch count
    TensorRefD ref_D;             ///< D matrix computed by GEMM+Max (input)
    TensorRefN ref_N;             ///< Norm tensor (input)
    TensorRefSum ref_S;           ///< Sum tensor (input)
    TensorRefSoft ref_Soft;       ///< Softmax tensor (output)
    int64_t batch_stride_D;       ///< Batch stride for D tensor
    int64_t batch_stride_N;       ///< Batch stride for N tensor
    int64_t batch_stride_S;       ///< Batch stride for S tensor
    int64_t batch_stride_Soft;    ///< Batch stride for softmax tensor

    //
    // Methods
    //

    Arguments():
      batch_count(1),
      batch_stride_D(0),
      batch_stride_N(0),
      batch_stride_S(0),
      batch_stride_Soft(0)
    { }

    Arguments(
      MatrixCoord extent_,              ///< Extent of D and Softmax matrices
      int batch_count_,                 ///< Batch count
      TensorRefD ref_D_,                ///< D matrix computed by GEMM+PartialReduce
      TensorRefN ref_N_,                ///< Norm tensor (input)
      TensorRefSum ref_S_,              ///< Sum tensor (input)
      TensorRefSoft ref_Soft_,          ///< Softmax tensor (output)
      int64_t batch_stride_D_ = 0,
      int64_t batch_stride_N_ = 0,
      int64_t batch_stride_S_ = 0,
      int64_t batch_stride_Soft_ = 0
    ):
      extent(extent_),
      batch_count(batch_count_),
      ref_D(ref_D_),
      ref_N(ref_N_),
      ref_S(ref_S_),
      ref_Soft(ref_Soft_),
      batch_stride_D(batch_stride_D_),
      batch_stride_N(batch_stride_N_),
      batch_stride_S(batch_stride_S_),
      batch_stride_Soft(batch_stride_Soft_)
    { }
  };
  //
  // Params struct
  //

  struct Params {
    Arguments args;

    //
    // Methods
    //
    Params() { }

    Params(Arguments const &args_): args(args_) { }
  };

  //
  // SharedStorage
  //

  struct SharedStorage { };

public:

  HYTLASS_DEVICE
  ApplySoftmax() { }

  HYTLASS_DEVICE
  void operator()(Params const &params, SharedStorage &shared_storage) {
    apply(params, shared_storage);
  }
private:

  /// Compute Softmax
  HYTLASS_DEVICE
  void apply(Params const &params, SharedStorage &shared_storage) {

    using AccessTypeD = AlignedArray<ElementD, kAlignment>;

    int block_batch = blockIdx.z;
    int block_m = blockIdx.x * ApplyShape::kRow;
    int block_n = 0;

    int thread_m = threadIdx.y;
    int thread_n = threadIdx.x * kAlignment;

    int idx_m = block_m + thread_m;
    int idx_n = block_n + thread_n;

    int batch_offset_norm = block_batch * params.args.batch_stride_N;
    int batch_offset_sum = block_batch * params.args.batch_stride_S;

    // Kill off thread if it is outside the row boundary
    if (params.args.extent.row() <= idx_m) {
      return;
    }

    //
    // Setup pointers to load D again
    //

    using AccessTypeSoft = AlignedArray<ElementSoft, kAlignment>;
    using FragmentSoft = Array<ElementSoft, kAlignment>;
    using ConvertSoftCompute = hytlass::NumericArrayConverter<ElementSoftmaxCompute, ElementD, kAlignment>;
    using ConvertSoftOutput = hytlass::NumericArrayConverter<ElementSoft, ElementSoftmaxCompute, kAlignment>;

    using Mul = hytlass::multiplies<FragmentSoftmax>;
    using Minus = hytlass::minus<FragmentSoftmax>;
    using Exp = hytlass::fast_exp_op<FragmentSoftmax>;

    ConvertSoftCompute convert_soft_compute;
    ConvertSoftOutput convert_soft_output;

    Minus minus;
    Mul mul;
    Exp exponential;

    using ConvertSum = hytlass::NumericConverter<ElementSoftmaxCompute, ElementSum>;
    using ConvertNorm = hytlass::NumericConverter<ElementSoftmaxCompute, ElementNorm>;

    ConvertSum convert_sum;
    ConvertNorm convert_norm;

    AccessTypeD *access_d = reinterpret_cast<AccessTypeD *>(
      params.args.ref_D.data() +
      params.args.batch_stride_D * block_batch +
      params.args.ref_D.layout()({idx_m, idx_n}));

    AccessTypeSoft *access_soft = reinterpret_cast<AccessTypeSoft *>(
      params.args.ref_Soft.data() +
      params.args.batch_stride_Soft * block_batch +
      params.args.ref_Soft.layout()({idx_m, idx_n}));

    ElementSum inv_sum = (params.args.ref_S.data())[idx_m + batch_offset_sum];
    ElementNorm norm = (params.args.ref_N.data())[idx_m + batch_offset_norm];

    //
    // Loop
    //
    HYTLASS_PRAGMA_UNROLL
    for (int idx = 0; idx < params.args.extent.column(); idx += ApplyShape::kColumn * kAlignment) {

      if (idx_n < params.args.extent.column()) {

        AccessTypeD fetch;
        arch::global_load<AccessTypeD, sizeof(AccessTypeD)>(fetch, access_d, true);

        FragmentSoftmax result = mul(exponential(minus(convert_soft_compute(fetch), convert_norm(norm))), convert_sum(inv_sum));
        FragmentSoft soft = convert_soft_output(result);

        arch::global_store<FragmentSoft, sizeof(FragmentSoft)>(soft, access_soft, true);
      }

      access_d += ApplyShape::kColumn;
      access_soft += ApplyShape::kColumn;
      idx_n += ApplyShape::kColumn * kAlignment;
    }
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
/////////////////////////////////////////////////////////////////////////////////////////////////
///
template <
  typename ElementA_,
  typename LayoutA_,
  typename ElementB_,
  typename LayoutB_,
  typename ElementC_,
  typename ElementCompute_,
  typename OperatorClass_,
  typename ArchTag_,
  typename ThreadblockShape_,
  typename WarpShape_,
  typename InstructionShape_,
  typename EpilogueFunctorOp_,
  int kStages_,
  typename ApplyShape_ = MatrixShape<1, 1024>,
  int AlignmentA_ = 128 / hytlass::sizeof_bits<ElementA_>::value,
  int AlignmentB_ = 128 / hytlass::sizeof_bits<ElementB_>::value,
  int AlignmentSoftmax_ = 128 / hytlass::sizeof_bits<ElementC_>::value,
  typename ElementNorm_ = float,
  typename ElementSum_ = float,
  typename ElementSoftmax_ = ElementC_
>
class GemmSoftmax {
public:

  ///////////////////////////////////////////////////////////////////////////////////////////////

  //
  // Type definitions
  //

  using ElementA = ElementA_;
  using ElementB = ElementB_;
  using ElementC = ElementC_;
  using ElementCompute = ElementCompute_;
  using ElementSum = ElementSum_;
  using ElementSoft = ElementSoftmax_;
  using ElementSoftmaxCompute = float;

  using LayoutA = LayoutA_;
  using LayoutB = LayoutB_;

  using EpilogueFunctorOp = EpilogueFunctorOp_;
  using ElementNorm = ElementNorm_;

  using ApplyShape = ApplyShape_;

  // These are mandatory layouts.
  using LayoutC = hytlass::layout::RowMajor;
  using LayoutN = hytlass::layout::RowMajor;
  using LayoutS = hytlass::layout::RowMajor;
  using LayoutSoft = hytlass::layout::RowMajor;

  using TensorRefA = TensorRef<ElementA, LayoutA>;
  using TensorRefB = TensorRef<ElementB, LayoutB>;
  using TensorRefC = TensorRef<ElementC, LayoutC>;
  using TensorRefN = TensorRef<ElementNorm, LayoutN>;
  using TensorRefSum = TensorRef<ElementSum, LayoutS>;
  using TensorRefSoft = TensorRef<ElementSoft, LayoutSoft>;

  using ThreadblockShape = ThreadblockShape_;
  using WarpShape = WarpShape_;
  using InstructionShape = InstructionShape_;

  using OperatorClass = OperatorClass_;
  using ArchTag = ArchTag_;

  static int const kStages = kStages_;
  static int const AlignmentA = AlignmentA_;
  static int const AlignmentB = AlignmentB_;
  static int const AlignmentSoftmax = AlignmentSoftmax_;

  using ThreadblockSwizzle = hytlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle;
  ///////////////////////////////////////////////////////////////////////////////////////////////

  // basic GEMM kernel
  using DefaultGemmKernel = typename hytlass::gemm::kernel::DefaultGemm<
    ElementA,
    LayoutA,
    AlignmentA,
    ElementB,
    LayoutB,
    AlignmentB,
    ElementC,
    LayoutC,
    ElementCompute,
    OperatorClass,
    ArchTag,
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    EpilogueFunctorOp,
    ThreadblockSwizzle,
    kStages,
    true,
    typename hytlass::gemm::device::DefaultGemmConfiguration<
        OperatorClass, ArchTag, ElementA, ElementB, ElementC, ElementCompute>::Operator,
    hytlass::gemm::SharedMemoryClearOption::kNone
  >::GemmKernel;

  ///////////////////////////////////////////////////////////////////////////////////////////////

  // Epilogue visitor
  using EpilogueVisitor = typename hytlass::epilogue::threadblock::EpilogueVisitorSoftmax<
    ThreadblockShape,
    DefaultGemmKernel::kThreadCount,
    typename DefaultGemmKernel::Epilogue::OutputTileIterator,
    ElementCompute,
    ElementNorm,
    ElementSum,
    ElementSoftmaxCompute,
    EpilogueFunctorOp
  >;

  /// Epilogue
  using Epilogue = typename hytlass::epilogue::threadblock::EpilogueWithVisitorFromExistingEpilogue<
    EpilogueVisitor,
    typename DefaultGemmKernel::Epilogue
  >::Epilogue;

  // GEMM
  using GemmKernel = gemm::kernel::GemmWithEpilogueVisitor<
    typename DefaultGemmKernel::Mma,
    Epilogue,
    ThreadblockSwizzle
  >;

  // Softmax kernel
  using SoftmaxApplyKernel = kernel::ApplySoftmax<
    ElementC,
    ElementNorm,
    ElementSum,
    ElementSoft,
    ElementSoftmaxCompute,
    AlignmentSoftmax,
    ApplyShape
  >;

  using ApplyFinalReductionKernel = hytlass::reduction::kernel::ApplySoftmaxFinalReduction<
    ElementNorm,
    ElementSum,
    ElementSoftmaxCompute,
    ThreadblockShape
  >;
public:

  /// Arguments class
  struct Arguments {

    typename GemmKernel::Arguments gemm;
    typename SoftmaxApplyKernel::Arguments softmax;
    typename ApplyFinalReductionKernel::Arguments reduction;
    hytlass::gemm::GemmCoord extend;

    //
    // Methods
    //
    Arguments() { }

    Arguments(
      hytlass::gemm::GemmCoord problem_size,
      int32_t batch_count_,
      TensorRefA ref_A_,
      TensorRefB ref_B_,
      TensorRefC ref_C_,
      TensorRefC ref_D_,
      typename EpilogueFunctorOp::Params linear_scaling,
      TensorRefN ref_N_,
      TensorRefSum ref_S_,
      TensorRefSoft ref_Softmax_,
      int64_t batch_stride_A_ = 0,
      int64_t batch_stride_B_ = 0,
      int64_t batch_stride_C_ = 0,
      int64_t batch_stride_D_ = 0,
      int64_t batch_stride_Max_ = 0,
      int64_t batch_stride_Sum_ = 0,
      int64_t batch_stride_Softmax_ = 0
    ):
      gemm(
        hytlass::gemm::GemmUniversalMode::kBatched,
        problem_size,
        batch_count_,
        ref_A_,
        ref_B_,
        ref_C_,
        ref_D_,
        ref_N_.data(),
        ref_S_.data(),
        batch_stride_A_,
        batch_stride_B_,
        typename EpilogueVisitor::Arguments(
          linear_scaling,
          batch_stride_C_,
          batch_stride_D_,
          batch_stride_Max_,
          batch_stride_Sum_
        )
      ),
      softmax(
        MatrixCoord(problem_size.m(), problem_size.n()),
        batch_count_,
        ref_D_,
        ref_N_,
        ref_S_,
        ref_Softmax_,
        batch_stride_D_,
        batch_stride_Max_,
        batch_stride_Sum_,
        batch_stride_Softmax_
      ),
      reduction(
        problem_size,
        ref_N_.data(),
        ref_S_.data(),
        batch_stride_Max_,
        batch_stride_Sum_
      ),
      extend(problem_size)
    { }
  };
  struct Params {

    typename GemmKernel::Params gemm;
    typename SoftmaxApplyKernel::Params softmax;
    typename ApplyFinalReductionKernel::Params reduction;
    MatrixCoord extend;

    //
    // Methods
    //
    Params() { }

    Params(Arguments const &args):
      gemm(args.gemm),
      softmax(args.softmax),
      reduction(args.reduction),
      extend(MatrixCoord(args.extend.m(), args.extend.n()))
    { }
  };

public:

  // Gemm

  //
  // Methods
  //

private:

  Params params_;

public:

  /// Ctor
  GemmSoftmax() { }

  /// Initialize
  Status initialize(Arguments const &args) {
    params_ = Params(args);
    return hytlass::Status::kSuccess;
  }
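  // run() below launches three kernels in sequence on the given stream:
  //   1. the GEMM, whose epilogue visitor records per-threadblock partial row
  //      maxima and exp-sums alongside D,
  //   2. ApplyFinalReductionKernel, which folds those partials into one norm
  //      and one sum per row (stored as a reciprocal, judging from the inv_sum
  //      variable in ApplySoftmax), and
  //   3. SoftmaxApplyKernel, which rescales D elementwise into the softmax output.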
  /// Run
  Status run(hipStream_t stream) {

    //
    // Launch the GEMM + max kernel
    //

    dim3 gemm_grid = ThreadblockSwizzle().get_grid_shape(params_.gemm.grid_tiled_shape);
    dim3 gemm_block(GemmKernel::kThreadCount, 1, 1);

    int gemm_smem_size = int(sizeof(typename GemmKernel::SharedStorage));

    hipError_t result;

    if (gemm_smem_size >= (48 << 10)) {
      result = hipFuncSetAttribute(
        (const void *)hytlass::Kernel<GemmKernel>,
        hipFuncAttributeMaxDynamicSharedMemorySize,
        gemm_smem_size);

      if (result != hipSuccess) {
        return Status::kErrorInternal;
      }
    }

    hytlass::Kernel<GemmKernel><<<gemm_grid, gemm_block, gemm_smem_size, stream>>>(params_.gemm);

    result = hipGetLastError();

    if (result != hipSuccess) {
      return hytlass::Status::kErrorInternal;
    }

    //
    // Launch the ApplyFinalReductionKernel
    //

    int thread_per_block = 128;
    int block_per_row = (params_.extend.row() + thread_per_block - 1) / thread_per_block;

    if (block_per_row < 4) {
      thread_per_block = 64;
      block_per_row = (params_.extend.row() + thread_per_block - 1) / thread_per_block;
    }

    dim3 final_reduction_grid(block_per_row, 1, params_.softmax.args.batch_count);
    dim3 final_reduction_block(thread_per_block);

    Kernel<ApplyFinalReductionKernel><<<
      final_reduction_grid,
      final_reduction_block,
      sizeof(typename ApplyFinalReductionKernel::SharedStorage),
      stream
    >>>(params_.reduction);

    result = hipGetLastError();

    if (result != hipSuccess) {
      return hytlass::Status::kErrorInternal;
    }

    //
    // Launch the SoftmaxApplyKernel
    //

    dim3 apply_block(SoftmaxApplyKernel::ApplyShape::kColumn, SoftmaxApplyKernel::ApplyShape::kRow);

    int threadblock_rows = SoftmaxApplyKernel::ApplyShape::kRow;
    int threadblock_columns = SoftmaxApplyKernel::ApplyShape::kColumn * SoftmaxApplyKernel::kAlignment;

    dim3 apply_grid(
      (params_.softmax.args.extent.row() + threadblock_rows - 1) / threadblock_rows,
      (params_.softmax.args.extent.column() + threadblock_columns - 1) / threadblock_columns,
      params_.softmax.args.batch_count);

    Kernel<SoftmaxApplyKernel><<<
      apply_grid,
      apply_block,
      sizeof(typename SoftmaxApplyKernel::SharedStorage),
      stream
    >>>(params_.softmax);

    result = hipGetLastError();

    if (result != hipSuccess) {
      return hytlass::Status::kErrorInternal;
    }

    return hytlass::Status::kSuccess;
  }
  /// Function call operator
  Status operator()(hipStream_t stream = nullptr) {
    return run(stream);
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace hytlass

/////////////////////////////////////////////////////////////////////////////////////////////////
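The device-side wrapper above follows the usual three-step host pattern: construct Arguments, call initialize(), then invoke the operator. The sketch below illustrates only that call pattern; it is not part of the commit. It assumes a concrete GemmSoftmax instantiation is in scope (passed here as the template parameter GemmSoftmaxOp), that GemmSoftmax publicly re-exports the TensorRef and EpilogueFunctorOp types its Arguments constructor uses, and that all refs point at valid device allocations of matching extents.

template <typename GemmSoftmaxOp>
hytlass::Status run_gemm_softmax(
    hytlass::gemm::GemmCoord problem_size,
    int32_t batch_count,
    typename GemmSoftmaxOp::TensorRefA ref_A,
    typename GemmSoftmaxOp::TensorRefB ref_B,
    typename GemmSoftmaxOp::TensorRefC ref_C,
    typename GemmSoftmaxOp::TensorRefC ref_D,
    typename GemmSoftmaxOp::TensorRefN ref_N,
    typename GemmSoftmaxOp::TensorRefSum ref_S,
    typename GemmSoftmaxOp::TensorRefSoft ref_Softmax,
    hipStream_t stream = nullptr) {

  // alpha = 1, beta = 0 for the GEMM epilogue's linear scaling.
  typename GemmSoftmaxOp::EpilogueFunctorOp::Params linear_scaling(1.0f, 0.0f);

  // Batch strides are left at their 0 defaults.
  typename GemmSoftmaxOp::Arguments args(
      problem_size, batch_count,
      ref_A, ref_B, ref_C, ref_D,
      linear_scaling,
      ref_N, ref_S, ref_Softmax);

  GemmSoftmaxOp op;
  hytlass::Status status = op.initialize(args);
  if (status != hytlass::Status::kSuccess) {
    return status;
  }

  // Launches the GEMM + max kernel, the final reduction, and the softmax
  // apply kernel on `stream`, as implemented in run() above.
  return op(stream);
}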
examples/17_ell_block_sparse_gemm/CMakeLists.txt (new file, mode 100644)
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
  ell_block_sparse_gemm
  ell_block_sparse_gemm.cu
)

hytlass_example_add_executable(
  ell_block_sparse_gemm_bias_relu
  ell_block_sparse_gemm_bias_relu.cu
)
examples/17_ell_block_sparse_gemm/ell_block_sparse_gemm.cu (new file, mode 100644)
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Block-Ell sparse gemm example.
This example performs a Sparse-matrix dense-matrix multiplication (SpMM) operation.
Matrix A is stored in the Blocked-Ellpack (Blocked-ELL) storage format.
Details about the Blocked-Ellpack (Blocked-ELL) storage format can be found here:
https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-spmat-create-blockedell
Whereas matrix B is a dense matrix.
The Blocked-Ellpack (Blocked-ELL) storage format comprises two matrices.
First is a packed matrix (ellValue matrix) that stores non-zero values in consecutive blocks,
represented by tensor_a in this example. Second is a matrix of indices (ellColInd matrix),
represented by tensor_ell_idx in this example, that represents the column indices of the
corresponding non-zero blocks. All rows in the matrices must have the same number of blocks.
ellColInd can contain -1 values for indicating empty blocks. These matrices store elements in
row-major order.
Description of parameters and tensors used to represent the Blocked-Ellpack (ELL) format
for this example:
a_rows - Rows in the sparse matrix.
a_cols - Columns in the sparse matrix.
a_ell_blocksize - Size of the ELL-Blocks.
a_ell_num_columns - Number of columns in the Blocked-Ellpack format (ellValue columns)
tensor_a - ellValue matrix, whose size is (a_rows * a_ell_num_columns)
tensor_ell_idx - Blocked-ELL Column indices (ellColInd), whose size is
(a_rows / a_ell_blocksize) * (a_ell_num_columns / a_ell_blocksize)
tensor_b - Input dense matrix whose size is (a_cols * n)
tensor_c/tensor_d - Output dense matrix whose size is (a_rows * n)
{a_rows, n, a_cols} - Problem size
*/
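// Worked layout example (illustrative only, not used by the code below): take a
// 4x4 matrix A with a_ell_blocksize = 2 and one non-zero 2x2 block per
// block-row, so a_ell_num_columns = 2:
//
//     A = [ B0   0 ]     ellValue (4x2) = [ B0 ]    ellColInd (2x1) = [ 0 ]
//         [  0  B1 ]                      [ B1 ]                      [ 1 ]
//
// Each ellColInd entry names the block-column of the stored block; an entry of
// -1 would mark an empty (all-zero) block.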
/////////////////////////////////////////////////////////////////////////////////////////////////
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <unordered_map>
#include "hytlass/hytlass.h"
#include "hytlass/gemm/gemm.h"
#include "hytlass/gemm/kernel/gemm_grouped.h"
#include "hytlass/gemm/kernel/default_gemm_grouped.h"
#include "hytlass/gemm/device/ell_gemm.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/distribution.h"
#include "hytlass/util/device_memory.h"
#include "hytlass/util/tensor_view_io.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/reference/host/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/reference/host/tensor_norm.h"
#include "hytlass/util/host_uncompress.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Result structure
struct Result {

  double runtime_ms;
  double gflops;
  hytlass::Status status;
  hipError_t error;
  bool passed;

  //
  // Methods
  //
  Result(
    double runtime_ms = 0,
    double gflops = 0,
    hytlass::Status status = hytlass::Status::kSuccess,
    hipError_t error = hipSuccess
  ):
    runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {

  bool help;
  bool reference_check;
  int iterations;
  int hip_streams;
  int a_rows, n, a_cols;
  int a_ell_num_columns;
  int a_ell_blocksize;
  int a_base;
  float alpha;
  float beta;

  //
  // Methods
  //

  Options():
    help(false),
    reference_check(true),
    iterations(20),
    hip_streams(0),
    a_rows(1024),
    n(1024),
    a_cols(1024),
    a_ell_num_columns(512),
    a_ell_blocksize(16),
    a_base(0),
    alpha(1),
    beta()
  { }
  // Parses the command line
  void parse(int argc, char const **args) {

    hytlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
    }

    cmd.get_cmd_line_argument("alpha", alpha, 1.0f);
    cmd.get_cmd_line_argument("beta", beta, 0.0f);
    cmd.get_cmd_line_argument("iterations", iterations, 20);
    cmd.get_cmd_line_argument("streams", hip_streams, 0);
    cmd.get_cmd_line_argument("reference-check", reference_check, true);
    cmd.get_cmd_line_argument("a_rows", a_rows, 1024);
    cmd.get_cmd_line_argument("n", n, 1024);
    cmd.get_cmd_line_argument("a_cols", a_cols, 1024);
    cmd.get_cmd_line_argument("a_ell_num_columns", a_ell_num_columns, 512);
    cmd.get_cmd_line_argument("a_ell_blocksize", a_ell_blocksize, 16);
    cmd.get_cmd_line_argument("a_base", a_base, 0);
  }
  /// Prints the usage statement.
  std::ostream & print_usage(std::ostream &out) const {

    out << "17_ell_block_sparse_gemm\n\n"
      << "  This example profiles the performance of an ELL block sparse GEMM kernel.\n\n"
      << "Options:\n\n"
      << "  --help                      If specified, displays this usage statement.\n\n"
      << "  --a_rows=<int>              Sets the number of rows of the sparse matrix.\n"
      << "  --n=<int>                   Sets the N dimension.\n"
      << "  --a_cols=<int>              Sets the number of columns of the sparse matrix.\n"
      << "  --a_ell_num_columns=<int>   Sets the actual number of columns of the Blocked-Ellpack format.\n"
      << "  --a_ell_blocksize=<int>     Sets the size of the ELL-Block.\n"
      << "  --a_base=<int>              Sets the base index.\n"
      << "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
      << "  --beta=<f32>                Epilogue scalar beta (real part)\n\n"
      << "  --iterations=<int>          Number of profiling iterations to perform.\n"
      << "  --reference-check=<bool>    If true, performs reference check.\n";

    out << "\n\nExamples:\n\n"
      << "# Runs a 1024x1024x1024 ELL block sparse GEMM with 16x16 block size and actual 512 non-zero columns in A operand\n"
      << "$ ./examples/17_ell_block_sparse_gemm/ell_block_sparse_gemm --a_rows=1024 --n=1024 --a_cols=1024 --a_ell_num_columns=512 --a_ell_blocksize=16\n\n";

    return out;
  }
  /// Compute performance in GFLOP/s
  double gflops(double runtime_s) const {

    // Number of real-valued multiply-adds
    int64_t fmas = (int64_t)a_rows * (int64_t)a_cols * (int64_t)n;

    // Two flops per multiply-add
    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
  }
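  // Worked example (illustrative): with the default a_rows = n = a_cols = 1024,
  // fmas = 1024^3 = 2^30, so the kernel performs 2 * 2^30 ≈ 2.147e9 flops; a
  // measured runtime of 1 ms (1.0e-3 s) therefore reports ≈ 2147 GFLOP/s. Note
  // the count uses the dense dimensions (a_rows x a_cols x n), not the number
  // of stored non-zero blocks.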
};
///////////////////////////////////////////////////////////////////////////////////////////////////
template <typename Gemm>
class Testbed {
public:

  //
  // Type definitions
  //

  using ElementA = typename Gemm::ElementA;
  using ElementB = typename Gemm::ElementB;
  using ElementC = typename Gemm::ElementC;
  using ElementAccumulator = typename Gemm::ElementAccumulator;

  using EpilogueOutputOp = typename Gemm::GemmKernel::Epilogue::OutputOp;
  using ElementCompute = typename EpilogueOutputOp::ElementCompute;

  using LayoutA = typename Gemm::LayoutA;
  using LayoutB = typename Gemm::LayoutB;
  using LayoutC = typename Gemm::LayoutC;

  using MatrixCoord = typename LayoutC::TensorCoord;
private:

  //
  // Data members
  //

  Options options;

  /// Initialization
  hytlass::Distribution::Kind init_A;
  hytlass::Distribution::Kind init_B;
  hytlass::Distribution::Kind init_C;
  hytlass::Distribution::Kind init_ELL;
  uint32_t seed;

  hytlass::HostTensor<ElementA, LayoutA> tensor_a;
  hytlass::HostTensor<ElementB, LayoutB> tensor_b;
  hytlass::HostTensor<ElementC, LayoutC> tensor_c;
  hytlass::HostTensor<ElementC, LayoutC> tensor_d;
  hytlass::HostTensor<ElementA, LayoutA> tensor_a_uncompressed;
  hytlass::HostTensor<ElementC, LayoutC> reference_d;
  hytlass::HostTensor<int32_t, LayoutA> tensor_ell_idx;
public:

  //
  // Methods
  //

  Testbed(
    Options const &options_,
    hytlass::Distribution::Kind init_A_ = hytlass::Distribution::Uniform,
    hytlass::Distribution::Kind init_B_ = hytlass::Distribution::Uniform,
    hytlass::Distribution::Kind init_C_ = hytlass::Distribution::Uniform,
    hytlass::Distribution::Kind init_ELL_ = hytlass::Distribution::Uniform,
    uint32_t seed_ = 3080
  ):
    options(options_), init_A(init_A_), init_B(init_B_), init_C(init_C_),
    init_ELL(init_ELL_), seed(seed_) { }
private:

  /// Helper to initialize a tensor view
  template <typename Element, typename Layout>
  void initialize_tensor_(
    hytlass::TensorView<Element, Layout> view,
    hytlass::Distribution::Kind dist_kind,
    uint32_t seed) {

    if (dist_kind == hytlass::Distribution::Uniform) {

      Element scope_max, scope_min;
      int bits_input = hytlass::sizeof_bits<Element>::value;
      int bits_output = hytlass::sizeof_bits<typename Gemm::ElementC>::value;

      if (bits_input == 1) {
        scope_max = 2;
        scope_min = 0;
      }
      else if (bits_input <= 8) {
        scope_max = 2;
        scope_min = -2;
      }
      else if (bits_output == 16) {
        if (hytlass::sizeof_bits<ElementAccumulator>::value <= 16) {
          scope_max = 5;
          scope_min = -5;
        }
        else {
          scope_max = 8;
          scope_min = -8;
        }
      }
      else {
        scope_max = 8;
        scope_min = -8;
      }

      hytlass::reference::host::TensorFillRandomUniform(
        view, seed, scope_max, scope_min, 0);
    }
    else if (dist_kind == hytlass::Distribution::Gaussian) {

      hytlass::reference::host::TensorFillRandomGaussian(
        view, seed, Element(), Element(0.5f));
    }
    else if (dist_kind == hytlass::Distribution::Sequential) {

      // Fill with increasing elements
      hytlass::reference::host::BlockFillSequential(
        view.data(), view.capacity(), Element(1), Element());
    }
    else {

      // Fill with all 1s
      hytlass::reference::host::BlockFillSequential(
        view.data(), view.capacity(), Element(), Element(1));
    }
  }
  /// Initializes data structures
  void initialize_() {

    tensor_a.resize(hytlass::make_Coord(options.a_rows, options.a_ell_num_columns));
    tensor_b.resize(hytlass::make_Coord(options.a_cols, options.n));
    tensor_c.resize(hytlass::make_Coord(options.a_rows, options.n));
    tensor_d.resize(hytlass::make_Coord(options.a_rows, options.n));
    tensor_a_uncompressed.resize(hytlass::make_Coord(options.a_rows, options.a_cols));
    reference_d.resize(hytlass::make_Coord(options.a_rows, options.n));
    tensor_ell_idx.resize(hytlass::make_Coord(
      options.a_rows / options.a_ell_blocksize,
      options.a_ell_num_columns / options.a_ell_blocksize));

    //
    // Initialize the problems of the workspace
    //

    initialize_tensor_(tensor_a.host_view(), init_A, seed * 2021);
    initialize_tensor_(tensor_b.host_view(), init_B, seed * 2022);
    initialize_tensor_(tensor_c.host_view(), init_C, seed * 2023);

    if (init_ELL == hytlass::Distribution::Uniform) {
      hytlass::reference::host::TensorFillRandomEllIdx(
        tensor_ell_idx.host_view(), seed,
        options.a_rows / options.a_ell_blocksize,
        options.a_ell_num_columns / options.a_ell_blocksize,
        options.a_cols / options.a_ell_blocksize);
    }
    else {
      for (int i = 0; i < options.a_rows / options.a_ell_blocksize; ++i) {
        for (int j = 0; j < options.a_ell_num_columns / options.a_ell_blocksize; ++j) {
          tensor_ell_idx.at({i, j}) = j + 3;
        }
      }
    }

    hytlass::reference::host::TensorCopy(reference_d.host_view(), tensor_c.host_view());

    tensor_a.sync_device();
    tensor_b.sync_device();
    tensor_c.sync_device();
    tensor_d.sync_device();
    tensor_ell_idx.sync_device();
  }
  /// Verifies the result against a host-side reference GEMM
  bool verify_() {

    bool passed = true;

    tensor_d.sync_host();

    hytlass::uncompress_ell_block_sparse(
      tensor_a_uncompressed.host_ref(),
      tensor_a.host_ref(),
      tensor_ell_idx.host_ref(),
      options.a_rows,
      options.a_cols,
      options.a_ell_num_columns,
      options.a_ell_blocksize);

    hytlass::reference::host::Gemm<
        typename Gemm::ElementA, typename Gemm::LayoutA,
        typename Gemm::ElementB, typename Gemm::LayoutB,
        typename Gemm::ElementC, typename Gemm::LayoutC,
        ElementCompute, ElementAccumulator, typename Gemm::Operator>
      reference_gemm;

    reference_gemm(
      {options.a_rows, options.n, options.a_cols},
      options.alpha,
      tensor_a_uncompressed.host_ref(),
      tensor_b.host_ref(),
      options.beta,
      reference_d.host_ref(),
      ElementAccumulator(0));

    // Reference check
    passed = hytlass::reference::host::TensorEquals(tensor_d.host_view(), reference_d.host_view());

    if (!passed) {
      std::cerr << "\n***\nError - problem failed the QA check\n***\n" << std::endl;

      std::stringstream fname;
      fname << "error_17_ell_block_sparse_gemm"
        << "mnk_" << options.a_rows << "x" << options.n << "x" << options.a_cols
        << "_" << options.a_ell_num_columns << "_" << options.a_ell_blocksize << ".txt";
      std::cout << fname.str() << std::endl;

      std::ofstream results(fname.str());

      results
        << "alpha: " << ElementCompute(options.alpha) << "\n"
        << "beta: " << ElementCompute(options.beta) << "\n"
        << "block size: " << options.a_ell_blocksize << "\n"
        << "\nA:\n" << tensor_a.host_view() << "\n"
        << "\nA Ell Index:\n" << tensor_ell_idx.host_view() << "\n"
        << "\nB:\n" << tensor_b.host_view() << "\n"
        << "\nC:\n" << tensor_c.host_view() << "\n"
        << "\nD reference:\n" << reference_d.host_view() << "\n"
        << "\nD computed:\n" << tensor_d.host_view() << "\n";

      return passed;
    }

    return passed;
  }
public:

  /// Returns true if the active device has sufficient hardware resources
  /// (shared memory) to run the kernel; otherwise returns false.
  bool sufficient() const {

    //
    // Determine SMEM requirements and waive if not satisfied
    //

    size_t smem_size = sizeof(typename Gemm::GemmKernel::SharedStorage);

    hipDeviceProp_t properties;
    int device_idx;

    hipError_t result = hipGetDevice(&device_idx);
    if (result != hipSuccess) {
      throw std::runtime_error("hipGetDevice() API call failed.");
    }

    result = hipGetDeviceProperties(&properties, device_idx);
    if (result != hipSuccess) {
      throw std::runtime_error("hipGetDeviceProperties() failed");
    }

    if (properties.sharedMemPerBlock < smem_size) {
      return false;
    }

    return true;
  }
  /// Executes a BlockedEll SpMM kernel and measures runtime.
  Result profile() {

    Result result;

    // Early exit
    if (!sufficient()) {
      std::cout << "Active HIP device lacks hardware resources to run HYTLASS BlockedEll SpMM kernel." << std::endl;
      return result;
    }

    result.passed = false;

    // Initialize the problem
    initialize_();

    // Configure the GEMM arguments
    typename EpilogueOutputOp::Params epilogue_op(options.alpha, options.beta);

    // Configure GEMM arguments
    typename Gemm::Arguments args(
      {options.a_rows, options.n, options.a_cols},
      tensor_a.device_ref(),
      tensor_b.device_ref(),
      tensor_c.device_ref(),
      tensor_d.device_ref(),
      tensor_ell_idx.device_data(),
      options.a_ell_num_columns,
      options.a_ell_blocksize,
      options.a_base,
      epilogue_op);

    // Initialize the GEMM object
    Gemm gemm{};

    result.status = gemm.initialize(args);
    if (result.status != hytlass::Status::kSuccess) {
      std::cerr << "Failed to initialize HYTLASS BlockedEll SpMM kernel." << std::endl;
      return result;
    }

    // Run the BlockedEll SpMM object
    result.status = gemm.run();
    if (result.status != hytlass::Status::kSuccess) {
      std::cerr << "Failed to run HYTLASS BlockedEll SpMM kernel." << std::endl;
      return result;
    }

    // Wait for completion
    result.error = hipDeviceSynchronize();
    if (result.error != hipSuccess) {
      std::cerr << "Kernel execution error: " << hipGetErrorString(result.error);
      return result;
    }

    //
    // Verify correctness
    //
    result.passed = true;

    if (options.reference_check) {
      result.passed = verify_();
    }

    //
    // Warm-up run
    //
    result.status = gemm.run();
    if (result.status != hytlass::Status::kSuccess) {
      std::cerr << "Failed to run HYTLASS BlockedEll SpMM kernel." << std::endl;
      return result;
    }

    //
    // Construct events
    //
    hipEvent_t events[2];

    for (auto &event : events) {
      result.error = hipEventCreate(&event);
      if (result.error != hipSuccess) {
        std::cerr << "hipEventCreate() failed: " << hipGetErrorString(result.error) << std::endl;
        return result;
      }
    }

    // Record an event at the start of a series of GEMM operations
    result.error = hipEventRecord(events[0]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    //
    // Run profiling loop
    //
    for (int iter = 0; iter < options.iterations; ++iter) {
      gemm();
    }

    //
    // Stop profiling loop
    //

    // Record an event when the GEMM operations have been launched.
    result.error = hipEventRecord(events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Wait for work on the device to complete.
    result.error = hipEventSynchronize(events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventSynchronize() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Measure elapsed runtime
    float runtime_ms = 0;
    result.error = hipEventElapsedTime(&runtime_ms, events[0], events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventElapsedTime() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Compute average runtime and GFLOPs.
    result.runtime_ms = double(runtime_ms) / double(options.iterations);
    result.gflops = options.gflops(result.runtime_ms / 1000.0);

    //
    // Cleanup
    //
    for (auto event : events) {
      (void)hipEventDestroy(event);
    }

    std::cout << std::endl;
    std::cout << "ELL Block Sparse GEMM (HYTLASS):\n"
      << "====================================================" << std::endl;
    std::cout << std::endl;
    std::cout << "    " << "Runtime: " << result.runtime_ms << " ms" << std::endl;
    std::cout << "    " << " GFLOPs: " << result.gflops << std::endl;

    return result;
  }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {

  //
  // This example uses mma.sync to directly access Tensor Cores to achieve peak performance.
  //

  hipDeviceProp_t props;

  hipError_t error = hipGetDeviceProperties(&props, 0);
  if (error != hipSuccess) {
    std::cerr << "hipGetDeviceProperties() returned an error: " << hipGetErrorString(error) << std::endl;
    return -1;
  }

  //
  // Parse options
  //

  Options options;

  options.parse(argc, args);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  //
  // Define the BlockedEll type
  //

  using ElementA = hytlass::half_t;
  using ElementB = hytlass::half_t;
  using ElementOutput = hytlass::half_t;
  using ElementAccumulator = float;

  using LayoutA = hytlass::layout::RowMajor;
  using LayoutB = hytlass::layout::RowMajor;
  using LayoutC = hytlass::layout::RowMajor;

  constexpr int32_t kAlignmentA = 128 / hytlass::sizeof_bits<ElementA>::value;
  constexpr int32_t kAlignmentB = 128 / hytlass::sizeof_bits<ElementB>::value;

  using ThreadblockShape = hytlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = hytlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = hytlass::gemm::GemmShape<16, 16, 16>;

  constexpr int32_t kStages = 2;

  using Gemm = typename hytlass::gemm::device::EllGemm<
    ElementA, LayoutA,
    ElementB, LayoutB,
    ElementOutput, LayoutC,
    ElementAccumulator,
    hytlass::arch::OpClassTensorOp,
    hytlass::arch::Gfx928,
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    hytlass::epilogue::thread::LinearCombination<
      ElementOutput,
      128 / hytlass::sizeof_bits<ElementOutput>::value,
      ElementAccumulator,
      ElementAccumulator>,
    hytlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
    kStages,
    kAlignmentA,
    kAlignmentB>;
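  // Note (illustrative): sizeof_bits<half_t>::value is 16, so kAlignmentA and
  // kAlignmentB evaluate to 128 / 16 = 8 elements, i.e. each vectorized
  // global-memory access moves 128 bits (eight half-precision values).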
  //
  // Profile it
  //

  Testbed<Gemm> testbed(options);

  if (!testbed.sufficient()) {
    std::cout << "The active HIP device lacks sufficient hardware resources to execute this kernel.\n";
    return 0;
  }

  Result result = testbed.profile();
  if (!result.passed) {
    std::cout << "Profiling HYTLASS ELL block sparse GEMM has failed.\n";
    std::cout << "\nFailed\n";
    return -1;
  }

  std::cout << "\nPassed\n";

  return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
examples/17_ell_block_sparse_gemm/ell_block_sparse_gemm_bias_relu.cu (new file, mode 100644)
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Block-Ell sparse gemm example.
This example performs a Sparse-matrix dense-matrix multiplication (SpMM) operation.
Matrix A is stored in the Blocked-Ellpack (Blocked-ELL) storage format.
Details about the Blocked-Ellpack (Blocked-ELL) storage format can be found here:
https://docs.nvidia.com/cuda/cusparse/index.html#cusparse-generic-spmat-create-blockedell
Whereas matrix B is a dense matrix.
The Blocked-Ellpack (Blocked-ELL) storage format comprises two matrices.
First is a packed matrix (ellValue matrix) that stores non-zero values in consecutive blocks,
represented by tensor_a in this example. Second is a matrix of indices (ellColInd matrix),
represented by tensor_ell_idx in this example, that represents the column indices of the
corresponding non-zero blocks. All rows in the matrices must have the same number of blocks.
ellColInd can contain -1 values for indicating empty blocks. These matrices store elements in
row-major order.
Description of parameters and tensors used to represent the Blocked-Ellpack (ELL) format
for this example:
a_rows - Rows in the sparse matrix.
a_cols - Columns in the sparse matrix.
a_ell_blocksize - Size of the ELL-Blocks.
a_ell_num_columns - Number of columns in the Blocked-Ellpack format (ellValue columns)
tensor_a - ellValue matrix, whose size is (a_rows * a_ell_num_columns)
tensor_ell_idx - Blocked-ELL Column indices (ellColInd), whose size is
(a_rows / a_ell_blocksize) * (a_ell_num_columns / a_ell_blocksize)
tensor_b - Input dense matrix whose size is (a_cols * n)
tensor_c/tensor_d - Output dense matrix whose size is (a_rows * n)
{a_rows, n, a_cols} - Problem size
*/
/////////////////////////////////////////////////////////////////////////////////////////////////
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <unordered_map>
#include "hytlass/hytlass.h"
#include "hytlass/gemm/gemm.h"
#include "hytlass/gemm/kernel/gemm_grouped.h"
#include "hytlass/gemm/kernel/default_gemm_grouped.h"
#include "hytlass/gemm/device/ell_gemm.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/distribution.h"
#include "hytlass/util/device_memory.h"
#include "hytlass/util/tensor_view_io.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/reference/host/gemm.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/reference/host/tensor_norm.h"
#include "hytlass/util/host_uncompress.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
// Note this example only works for RowMajor output
// ell_block_sparse_gemm + bias + relu
/// Result structure
struct Result {

  double runtime_ms;
  double gflops;
  hytlass::Status status;
  hipError_t error;
  bool passed;

  //
  // Methods
  //
  Result(
    double runtime_ms = 0,
    double gflops = 0,
    hytlass::Status status = hytlass::Status::kSuccess,
    hipError_t error = hipSuccess
  ):
    runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {

  bool help;
  bool reference_check;
  int iterations;
  int hip_streams;
  int a_rows, n, a_cols;
  int a_ell_num_columns;
  int a_ell_blocksize;
  int a_base;
  float alpha;

  //
  // Methods
  //

  Options():
    help(false),
    reference_check(true),
    iterations(20),
    hip_streams(0),
    a_rows(1024),
    n(1024),
    a_cols(1024),
    a_ell_num_columns(512),
    a_ell_blocksize(16),
    a_base(0),
    alpha(1)
  { }
  // Parses the command line
  void parse(int argc, char const **args) {

    hytlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
    }

    cmd.get_cmd_line_argument("alpha", alpha, 1.0f);
    cmd.get_cmd_line_argument("iterations", iterations, 20);
    cmd.get_cmd_line_argument("streams", hip_streams, 0);
    cmd.get_cmd_line_argument("reference-check", reference_check, true);
    cmd.get_cmd_line_argument("a_rows", a_rows, 1024);
    cmd.get_cmd_line_argument("n", n, 1024);
    cmd.get_cmd_line_argument("a_cols", a_cols, 1024);
    cmd.get_cmd_line_argument("a_ell_num_columns", a_ell_num_columns, 512);
    cmd.get_cmd_line_argument("a_ell_blocksize", a_ell_blocksize, 16);
    cmd.get_cmd_line_argument("a_base", a_base, 0);
  }
  /// Prints the usage statement.
  std::ostream & print_usage(std::ostream &out) const {

    out << "17_ell_block_sparse_gemm\n\n"
      << "  This example profiles the performance of an ELL block sparse GEMM kernel with a fused bias + ReLU epilogue.\n\n"
      << "Options:\n\n"
      << "  --help                      If specified, displays this usage statement.\n\n"
      << "  --a_rows=<int>              Sets the number of rows of the sparse matrix.\n"
      << "  --n=<int>                   Sets the N dimension.\n"
      << "  --a_cols=<int>              Sets the number of columns of the sparse matrix.\n"
      << "  --a_ell_num_columns=<int>   Sets the actual number of columns of the Blocked-Ellpack format.\n"
      << "  --a_ell_blocksize=<int>     Sets the size of the ELL-Block.\n"
      << "  --a_base=<int>              Sets the base index.\n"
      << "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
      << "  --iterations=<int>          Number of profiling iterations to perform.\n"
      << "  --reference-check=<bool>    If true, performs reference check.\n";

    out << "\n\nExamples:\n\n"
      << "# Runs a 1024x1024x1024 ELL block sparse GEMM with 16x16 block size and actual 512 non-zero columns in A operand\n"
      << "$ ./examples/17_ell_block_sparse_gemm/ell_block_sparse_gemm_bias_relu --a_rows=1024 --n=1024 --a_cols=1024 --a_ell_num_columns=512 --a_ell_blocksize=16\n\n";

    return out;
  }
  /// Compute performance in GFLOP/s
  double gflops(double runtime_s) const {

    // Number of real-valued multiply-adds
    int64_t fmas = (int64_t)a_rows * (int64_t)a_cols * (int64_t)n;

    // Two flops per multiply-add
    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
  }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
template <typename Gemm>
class Testbed {
public:

  //
  // Type definitions
  //

  using ElementA = typename Gemm::ElementA;
  using ElementB = typename Gemm::ElementB;
  using ElementC = typename Gemm::ElementC;
  using ElementAccumulator = typename Gemm::ElementAccumulator;

  using EpilogueOutputOp = typename Gemm::GemmKernel::Epilogue::OutputOp;
  using ElementCompute = typename EpilogueOutputOp::ElementCompute;

  using LayoutA = typename Gemm::LayoutA;
  using LayoutB = typename Gemm::LayoutB;
  using LayoutC = typename Gemm::LayoutC;

  using MatrixCoord = typename LayoutC::TensorCoord;
private:

  //
  // Data members
  //

  Options options;

  /// Initialization
  hytlass::Distribution::Kind init_A;
  hytlass::Distribution::Kind init_B;
  hytlass::Distribution::Kind init_C;
  hytlass::Distribution::Kind init_ELL;
  uint32_t seed;

  hytlass::HostTensor<ElementA, LayoutA> tensor_a;
  hytlass::HostTensor<ElementB, LayoutB> tensor_b;
  hytlass::HostTensor<ElementC, LayoutC> tensor_c_bias;
  hytlass::HostTensor<ElementC, LayoutC> tensor_d;
  hytlass::HostTensor<ElementA, LayoutA> tensor_a_uncompressed;
  hytlass::HostTensor<ElementC, LayoutC> reference_d;
  hytlass::HostTensor<int32_t, LayoutA> tensor_ell_idx;
public:

  //
  // Methods
  //

  Testbed(
    Options const &options_,
    hytlass::Distribution::Kind init_A_ = hytlass::Distribution::Uniform,
    hytlass::Distribution::Kind init_B_ = hytlass::Distribution::Uniform,
    hytlass::Distribution::Kind init_C_ = hytlass::Distribution::Uniform,
    hytlass::Distribution::Kind init_ELL_ = hytlass::Distribution::Uniform,
    uint32_t seed_ = 3080
  ):
    options(options_), init_A(init_A_), init_B(init_B_), init_C(init_C_),
    init_ELL(init_ELL_), seed(seed_) { }
private:

  /// Helper to initialize a tensor view
  template <typename Element, typename Layout>
  void initialize_tensor_(
    hytlass::TensorView<Element, Layout> view,
    hytlass::Distribution::Kind dist_kind,
    uint32_t seed) {

    if (dist_kind == hytlass::Distribution::Uniform) {

      Element scope_max, scope_min;
      int bits_input = hytlass::sizeof_bits<Element>::value;
      int bits_output = hytlass::sizeof_bits<typename Gemm::ElementC>::value;

      if (bits_input == 1) {
        scope_max = 2;
        scope_min = 0;
      }
      else if (bits_input <= 8) {
        scope_max = 2;
        scope_min = -2;
      }
      else if (bits_output == 16) {
        if (hytlass::sizeof_bits<ElementAccumulator>::value <= 16) {
          scope_max = 5;
          scope_min = -5;
        }
        else {
          scope_max = 8;
          scope_min = -8;
        }
      }
      else {
        scope_max = 8;
        scope_min = -8;
      }

      hytlass::reference::host::TensorFillRandomUniform(
        view, seed, scope_max, scope_min, 0);
    }
    else if (dist_kind == hytlass::Distribution::Gaussian) {

      hytlass::reference::host::TensorFillRandomGaussian(
        view, seed, Element(), Element(0.5f));
    }
    else if (dist_kind == hytlass::Distribution::Sequential) {

      // Fill with increasing elements
      hytlass::reference::host::BlockFillSequential(
        view.data(), view.capacity(), Element(1), Element());
    }
    else {

      // Fill with all 1s
      hytlass::reference::host::BlockFillSequential(
        view.data(), view.capacity(), Element(), Element(1));
    }
  }
  /// Initializes data structures
  void initialize_() {

    tensor_a.resize(hytlass::make_Coord(options.a_rows, options.a_ell_num_columns));
    tensor_b.resize(hytlass::make_Coord(options.a_cols, options.n));
    // The bias is a single row broadcast across all output rows: one entry per output column.
    tensor_c_bias.resize(hytlass::make_Coord(1, options.n));
    tensor_d.resize(hytlass::make_Coord(options.a_rows, options.n));
    tensor_a_uncompressed.resize(hytlass::make_Coord(options.a_rows, options.a_cols));
    reference_d.resize(hytlass::make_Coord(options.a_rows, options.n));
    tensor_ell_idx.resize(hytlass::make_Coord(
      options.a_rows / options.a_ell_blocksize,
      options.a_ell_num_columns / options.a_ell_blocksize));

    //
    // Initialize the problems of the workspace
    //

    initialize_tensor_(tensor_a.host_view(), init_A, seed * 2021);
    initialize_tensor_(tensor_b.host_view(), init_B, seed * 2022);
    initialize_tensor_(tensor_c_bias.host_view(), init_C, seed * 2023);

    if (init_ELL == hytlass::Distribution::Uniform) {
      hytlass::reference::host::TensorFillRandomEllIdx(
        tensor_ell_idx.host_view(), seed,
        options.a_rows / options.a_ell_blocksize,
        options.a_ell_num_columns / options.a_ell_blocksize,
        options.a_cols / options.a_ell_blocksize);
    }
    else {
      for (int i = 0; i < options.a_rows / options.a_ell_blocksize; ++i) {
        for (int j = 0; j < options.a_ell_num_columns / options.a_ell_blocksize; ++j) {
          tensor_ell_idx.at({i, j}) = j + 3;
        }
      }
    }

    tensor_a.sync_device();
    tensor_b.sync_device();
    tensor_c_bias.sync_device();
    tensor_d.sync_device();
    tensor_ell_idx.sync_device();
  }
  /// Verifies the result against a host-side reference GEMM with bias + ReLU
  bool verify_() {

    bool passed = true;

    tensor_d.sync_host();

    hytlass::uncompress_ell_block_sparse(
      tensor_a_uncompressed.host_ref(),
      tensor_a.host_ref(),
      tensor_ell_idx.host_ref(),
      options.a_rows,
      options.a_cols,
      options.a_ell_num_columns,
      options.a_ell_blocksize);

    hytlass::reference::host::Gemm<
        typename Gemm::ElementA, typename Gemm::LayoutA,
        typename Gemm::ElementB, typename Gemm::LayoutB,
        typename Gemm::ElementC, typename Gemm::LayoutC,
        ElementCompute, ElementAccumulator, typename Gemm::Operator>
      reference_gemm;

    reference_gemm(
      {options.a_rows, options.n, options.a_cols},
      options.alpha,
      tensor_a_uncompressed.host_ref(),
      tensor_b.host_ref(),
      0,
      reference_d.host_ref(),
      ElementAccumulator(0));

    // Compute bias + relu in host code
    for (int i = 0; i < options.a_rows; ++i) {
      for (int j = 0; j < options.n; ++j) {
        reference_d.at({i, j}) = std::max(
          ElementC(0),
          ElementC(reference_d.at({i, j}) + tensor_c_bias.at({0, j})));
      }
    }

    // Reference check
    passed = hytlass::reference::host::TensorEquals(tensor_d.host_view(), reference_d.host_view());

    if (!passed) {
      std::cerr << "\n***\nError - problem failed the QA check\n***\n" << std::endl;

      std::stringstream fname;
      fname << "error_17_ell_block_sparse_gemm"
        << "mnk_" << options.a_rows << "x" << options.n << "x" << options.a_cols
        << "_" << options.a_ell_num_columns << "_" << options.a_ell_blocksize << ".txt";
      std::cout << fname.str() << std::endl;

      std::ofstream results(fname.str());

      results
        << "alpha: " << ElementCompute(options.alpha) << "\n"
        << "block size: " << options.a_ell_blocksize << "\n"
        << "\nA:\n" << tensor_a.host_view() << "\n"
        << "\nA Ell Index:\n" << tensor_ell_idx.host_view() << "\n"
        << "\nB:\n" << tensor_b.host_view() << "\n"
        << "\nC:\n" << tensor_c_bias.host_view() << "\n"
        << "\nD reference:\n" << reference_d.host_view() << "\n"
        << "\nD computed:\n" << tensor_d.host_view() << "\n";

      return passed;
    }

    return passed;
  }
public:

  /// Returns true if the active device has sufficient hardware resources
  /// (shared memory) to run the kernel; otherwise returns false.
  bool sufficient() const {

    //
    // Determine SMEM requirements and waive if not satisfied
    //

    size_t smem_size = sizeof(typename Gemm::GemmKernel::SharedStorage);

    hipDeviceProp_t properties;
    int device_idx;

    hipError_t result = hipGetDevice(&device_idx);
    if (result != hipSuccess) {
      throw std::runtime_error("hipGetDevice() API call failed.");
    }

    result = hipGetDeviceProperties(&properties, device_idx);
    if (result != hipSuccess) {
      throw std::runtime_error("hipGetDeviceProperties() failed");
    }

    if (properties.sharedMemPerBlock < smem_size) {
      return false;
    }

    return true;
  }
  /// Executes a BlockedEll SpMM kernel and measures runtime.
  Result profile() {

    Result result;

    // Early exit
    if (!sufficient()) {
      std::cout << "Active HIP device lacks hardware resources to run HYTLASS BlockedEll SpMM kernel." << std::endl;
      return result;
    }

    result.passed = false;

    // Initialize the problem
    initialize_();

    // Configure the GEMM arguments
    typename EpilogueOutputOp::Params epilogue_op(options.alpha);

    // Configure GEMM arguments. Stride 0 on the C reference broadcasts the
    // single bias row across every row of the output.
    typename Gemm::Arguments args(
      {options.a_rows, options.n, options.a_cols},
      tensor_a.device_ref(),
      tensor_b.device_ref(),
      {tensor_c_bias.device_data(), 0},
      tensor_d.device_ref(),
      tensor_ell_idx.device_data(),
      options.a_ell_num_columns,
      options.a_ell_blocksize,
      options.a_base,
      epilogue_op);

    // Initialize the GEMM object
    Gemm gemm{};

    result.status = gemm.initialize(args);
    if (result.status != hytlass::Status::kSuccess) {
      std::cerr << "Failed to initialize HYTLASS BlockedEll SpMM kernel." << std::endl;
      return result;
    }

    // Run the BlockedEll SpMM object
    result.status = gemm.run();
    if (result.status != hytlass::Status::kSuccess) {
      std::cerr << "Failed to run HYTLASS BlockedEll SpMM kernel." << std::endl;
      return result;
    }

    // Wait for completion
    result.error = hipDeviceSynchronize();
    if (result.error != hipSuccess) {
      std::cerr << "Kernel execution error: " << hipGetErrorString(result.error);
      return result;
    }

    //
    // Verify correctness
    //
    result.passed = true;

    if (options.reference_check) {
      result.passed = verify_();
    }

    //
    // Warm-up run
    //
    result.status = gemm.run();
    if (result.status != hytlass::Status::kSuccess) {
      std::cerr << "Failed to run HYTLASS BlockedEll SpMM kernel." << std::endl;
      return result;
    }

    //
    // Construct events
    //
    hipEvent_t events[2];

    for (auto &event : events) {
      result.error = hipEventCreate(&event);
      if (result.error != hipSuccess) {
        std::cerr << "hipEventCreate() failed: " << hipGetErrorString(result.error) << std::endl;
        return result;
      }
    }

    // Record an event at the start of a series of GEMM operations
    result.error = hipEventRecord(events[0]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    //
    // Run profiling loop
    //
    for (int iter = 0; iter < options.iterations; ++iter) {
      gemm();
    }

    //
    // Stop profiling loop
    //

    // Record an event when the GEMM operations have been launched.
    result.error = hipEventRecord(events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Wait for work on the device to complete.
    result.error = hipEventSynchronize(events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventSynchronize() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Measure elapsed runtime
    float runtime_ms = 0;
    result.error = hipEventElapsedTime(&runtime_ms, events[0], events[1]);
    if (result.error != hipSuccess) {
      std::cerr << "hipEventElapsedTime() failed: " << hipGetErrorString(result.error) << std::endl;
      return result;
    }

    // Compute average runtime and GFLOPs.
    result.runtime_ms = double(runtime_ms) / double(options.iterations);
    result.gflops = options.gflops(result.runtime_ms / 1000.0);

    //
    // Cleanup
    //
    for (auto event : events) {
      (void)hipEventDestroy(event);
    }

    std::cout << std::endl;
    std::cout << "ELL Block Sparse GEMM (HYTLASS):\n"
      << "====================================================" << std::endl;
    std::cout << std::endl;
    std::cout << "    " << "Runtime: " << result.runtime_ms << " ms" << std::endl;
    std::cout << "    " << " GFLOPs: " << result.gflops << std::endl;

    return result;
  }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {

  //
  // This example uses mma.sync to directly access Tensor Cores to achieve peak performance.
  //

  hipDeviceProp_t props;

  hipError_t error = hipGetDeviceProperties(&props, 0);
  if (error != hipSuccess) {
    std::cerr << "hipGetDeviceProperties() returned an error: " << hipGetErrorString(error) << std::endl;
    return -1;
  }

  //
  // Parse options
  //

  Options options;

  options.parse(argc, args);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  //
  // Define the BlockedEll type
  //

  using ElementA = hytlass::half_t;
  using ElementB = hytlass::half_t;
  using ElementOutput = hytlass::half_t;
  using ElementAccumulator = float;

  using LayoutA = hytlass::layout::RowMajor;
  using LayoutB = hytlass::layout::RowMajor;
  using LayoutC = hytlass::layout::RowMajor;

  constexpr int32_t kAlignmentA = 128 / hytlass::sizeof_bits<ElementA>::value;
  constexpr int32_t kAlignmentB = 128 / hytlass::sizeof_bits<ElementB>::value;

  using ThreadblockShape = hytlass::gemm::GemmShape<128, 128, 32>;
  using WarpShape = hytlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = hytlass::gemm::GemmShape<16, 16, 16>;

  // Define the epilogue operation as LinearCombinationRelu. This is approximately equal to
  //
  //    d_ij = max(0, alpha * sum_k(a_ik * b_kj) + c_ij)
  //
  using EpilogueOp = hytlass::epilogue::thread::LinearCombinationRelu<
    ElementOutput,                                     // <- data type of output matrix
    128 / hytlass::sizeof_bits<ElementOutput>::value,  // <- number of elements per vectorized
                                                       //    memory access. For half precision,
                                                       //    it's 8 elements. This becomes the
                                                       //    vector width of math instructions
                                                       //    in the epilogue too
    ElementAccumulator,                                // <- data type of accumulator
    ElementAccumulator,                                // <- data type for alpha in linear combination function
    hytlass::epilogue::thread::ScaleType::NoBetaScaling>;  // <- alpha x C + bias

  constexpr int32_t kStages = 2;

  using Gemm = typename hytlass::gemm::device::EllGemm<
    ElementA, LayoutA,
    ElementB, LayoutB,
    ElementOutput, LayoutC,
    ElementAccumulator,
    hytlass::arch::OpClassTensorOp,
    hytlass::arch::Gfx928,
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    EpilogueOp,
    hytlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
    kStages,
    kAlignmentA,
    kAlignmentB>;

  //
  // Profile it
  //

  Testbed<Gemm> testbed(options);

  if (!testbed.sufficient()) {
    std::cout << "The active device lacks sufficient hardware resources to execute this kernel.\n";
    return 0;
  }

  Result result = testbed.profile();
  if (!result.passed) {
    std::cout << "Profiling HYTLASS ELL block sparse GEMM has failed.\n";
    std::cout << "\nFailed\n";
    return -1;
  }

  std::cout << "\nPassed\n";

  return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
examples/18_gemm_with_abs_max/CMakeLists.txt (new file, mode 100644)
# Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
hytlass_example_add_executable(
  gemm_with_absmax
  gemm_with_absmax.cu
)
examples/18_gemm_with_abs_max/gemm_with_absmax.cu (new file, mode 100644)
/***************************************************************************************************
* Copyright (c) 2023 - 2025 Hygon Information Technology Co., Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Example of running a scaled AbsMax GEMM.
In addition to using Fp16 Tensor Core instructions, the GEMM uses a distinct epilogue
that enables additional scaling of operands/outputs, storing a pre-activation-function output
tensor (called the "auxiliary" output), and computing the absolute maximum value of the
outputs.
Pseudocode for this epilogue is as follows:
Aux = ((alpha * scale_a * scale_b) * accumulator) + ((beta * scale_c) * source) + bias
D = activation(Aux)
if Aux is fp8 type:
abs_max_output = max( abs(aux) | (for every aux in Aux))
Aux = scale_aux * Aux
endif
if D is fp8 type:
abs_max_output = max( abs(d) | (for every d in D))
D = scale_d * D
endif
Parameter Aux is optionally stored to global memory
*/
#include <iostream>
#include <fstream>
#include <sstream>
#include "hytlass/hytlass.h"
#include "hytlass/numeric_conversion.h"
#include "hytlass/util/command_line.h"
#include "hytlass/util/host_tensor.h"
#include "hytlass/util/reference/host/gemm_complex.h"
#include "hytlass/util/tensor_view_io.h"
#include "hytlass/util/distribution.h"
#include "hytlass/util/reference/host/tensor_fill.h"
#include "hytlass/util/reference/host/tensor_copy.h"
#include "hytlass/util/reference/host/tensor_compare.h"
#include "hytlass/util/reference/host/tensor_norm.h"
#include "hytlass/util/reference/host/gemm.h"
#include "hytlass/epilogue/thread/activation.h"
#include "hytlass/epilogue/thread/linear_combination_generic_with_scaling.h"
#include "hytlass/gemm/device/gemm_universal_with_absmax.h"
#include "hytlass/layout/matrix.h"
#include "hytlass/matrix_coord.h"
#include "hytlass/gemm/device/gemm_universal_adapter.h"
using ElementA = hytlass::half_t;
using ElementB = hytlass::half_t;
using ElementOutput = hytlass::float_e4m3_t;
using ElementAuxOutput = ElementOutput;
using ElementAccumulator = float;

using LayoutA = hytlass::layout::RowMajor;
using LayoutB = hytlass::layout::ColumnMajor;
using LayoutC = hytlass::layout::RowMajor;

static int const kStages = 1;
static int const kAlignmentA = 8;
static int const kAlignmentB = 8;
using EpilogueOutputOp = hytlass::epilogue::thread::LinearCombinationGenericWithScalingAndAbsMax<
  hytlass::epilogue::thread::ReLu,
  ElementOutput,
  ElementAuxOutput,
  8,
  ElementAccumulator,
  ElementAccumulator>;
template <typename MathOperator>
using Gemm_ = hytlass::gemm::device::GemmUniversalWithAbsMax<
  ElementA, LayoutA,
  ElementB, LayoutB,
  ElementOutput, LayoutC,
  ElementAccumulator,
  hytlass::arch::OpClassTensorOp,
  hytlass::arch::Gfx928,
  hytlass::gemm::GemmShape<128, 64, 128>,
  hytlass::gemm::GemmShape<64, 32, 128>,
  hytlass::gemm::GemmShape<16, 16, 16>,
  EpilogueOutputOp,
  hytlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
  kStages,
  kAlignmentA,
  kAlignmentB,
  MathOperator>;
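// Illustrative instantiation (an assumption, not taken from this file): a
// concrete kernel type is obtained by supplying a math-operator tag, e.g.
//
//   using Gemm = Gemm_<hytlass::arch::OpMultiplyAdd>;
//
// where OpMultiplyAdd is the conventional tag for ordinary multiply-add. The
// tag actually used by this example is chosen further down in the file.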
using ElementAbsmax = typename EpilogueOutputOp::ElementAbsmax;
// Command line options parsing
struct Options {

  bool help;
  bool error;
  bool reference_check;
  hytlass::gemm::GemmCoord problem_size;

  int iterations;
  int warmup_iterations;

  bool scale_A;
  bool scale_B;
  bool scale_C;

  float alpha;
  float beta;

  Options():
    help(false),
    error(false),
    reference_check(false),
    iterations(20),
    warmup_iterations(5),
    scale_A(true),
    scale_B(true),
    scale_C(true),
    alpha(1.f),
    beta(0.f)
  { }
  // Parses the command line
  void parse(int argc, char const **args) {

    hytlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
      return;
    }

    cmd.get_cmd_line_argument("iterations", iterations, 20);
    cmd.get_cmd_line_argument("warmup_iterations", warmup_iterations, 5);
    cmd.get_cmd_line_argument("reference-check", reference_check, false);
    cmd.get_cmd_line_argument("scale-A", scale_A, true);
    cmd.get_cmd_line_argument("scale-B", scale_B, true);
    cmd.get_cmd_line_argument("scale-C", scale_C, true);
    cmd.get_cmd_line_argument("alpha", alpha, 1.f);
    cmd.get_cmd_line_argument("beta", beta, 0.f);

    int m, n, k;
    cmd.get_cmd_line_argument("m", m, 1024);
    cmd.get_cmd_line_argument("n", n, 1024);
    cmd.get_cmd_line_argument("k", k, 1024);

    problem_size = hytlass::gemm::GemmCoord{m, n, k};
  }
/// Prints the usage statement.
std
::
ostream
&
print_usage
(
std
::
ostream
&
out
)
const
{
out
<<
"18_gemm_with_abs_max
\n\n
"
<<
" This example executes a GEMM with AbsMax Tensor Core operations. In addition to performing
\n
"
<<
" a normal GEMM, the kernel performs the following operations:
\n
"
<<
" Aux = ((alpha * scale_a * scale_b) * accumulator) + ((beta * scale_c) * source) + bias
\n
"
<<
" D = activation(Aux)
\n\n
"
<<
" if Aux is fp8:
\n
"
<<
" abs_max_output = max( abs(aux) | (for every aux in Aux) )
\n
"
<<
" Aux = scale_aux * Aux
\n\n
"
<<
" if D is fp8 type:
\n
"
<<
" abs_max_output = max( abs(d) | (for every d in D) )
\n
"
<<
" D = scale_d * D
\n\n
"
<<
"Options:
\n\n
"
<<
" --help If specified, displays this usage statement
\n\n
"
<<
" --m=<int> Sets the M dimension of the GEMM
\n
"
<<
" --n=<int> Sets the N dimension of the GEMM
\n
"
<<
" --k=<int> Sets the K dimension of the GEMM
\n
"
<<
" --scale-A=<bool> Whether to apply a scaling factor to operand A (default: true)
\n
"
<<
" --scale-B=<bool> Whether to apply a scaling factor to operand B (default: true)
\n
"
<<
" --scale-C=<bool> Whether to apply a scaling factor to operand C (default: true)
\n
"
<<
" --iterations=<int> Number of profiling iterations to perform
\n
"
<<
" --warmup-iterations=<int> Number of warmup iterations to perform
\n
"
<<
" --reference-check=<bool> If true, performs reference check
\n
"
;
return
out
;
}
/// Compute performance in GFLOP/s
float
gflops
(
float
runtime_s
)
const
{
// Two flops per multiply-add
return
2.0
f
*
float
(
problem_size
.
product
())
/
float
(
1.0e9
)
/
runtime_s
;
}
};
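// Example invocation (a sketch; the binary name follows this example's
// directory, and every flag below maps to Options::parse above, with
// unspecified flags falling back to the defaults shown in print_usage):
//
//   ./18_gemm_with_abs_max --m=1024 --n=1024 --k=1024 --reference-check=true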
/// Helper class to run the kernel
template <typename Gemm>
struct TestbedRunner {

  using ElementAccumulator = typename Gemm::ElementAccumulator;
  using ElementCompute = typename Gemm::GemmKernel::Epilogue::OutputOp::ElementCompute;
  using ElementScalingFactor = typename Gemm::EpilogueOutputOp::ElementScalingFactor;

  static bool const kScaleAux = Gemm::EpilogueOutputOp::kIsScalingAndAmaxAuxOutputNeeded;
  static bool const kScaleOutput = Gemm::EpilogueOutputOp::kIsScalingAndAmaxOutputNeeded;

  /// Initialization
  hytlass::Distribution::Kind init_A;
  hytlass::Distribution::Kind init_B;
  hytlass::Distribution::Kind init_C;
  uint64_t seed;

  hytlass::HostTensor<typename Gemm::ElementA, typename Gemm::LayoutA> tensor_A;
  hytlass::HostTensor<typename Gemm::ElementB, typename Gemm::LayoutB> tensor_B;
  hytlass::HostTensor<typename Gemm::ElementC, typename Gemm::LayoutC> tensor_C;
  hytlass::HostTensor<typename Gemm::EpilogueOutputOp::ElementAuxOutput, typename Gemm::LayoutC> tensor_Aux;
  hytlass::HostTensor<typename Gemm::EpilogueOutputOp::ElementOutput, typename Gemm::LayoutC> tensor_D;
  hytlass::HostTensor<typename Gemm::ElementC, typename Gemm::LayoutC> tensor_Vector;
  hytlass::HostTensor<ElementAccumulator, typename Gemm::LayoutC> tmp_D;
  hytlass::HostTensor<typename Gemm::EpilogueOutputOp::ElementOutput, typename Gemm::LayoutC> reference_D;
  hytlass::HostTensor<typename Gemm::EpilogueOutputOp::ElementAuxOutput, typename Gemm::LayoutC> reference_Aux;
  hytlass::HostTensor<ElementScalingFactor, typename Gemm::LayoutC> scale_A;
  hytlass::HostTensor<ElementScalingFactor, typename Gemm::LayoutC> scale_B;
  hytlass::HostTensor<ElementScalingFactor, typename Gemm::LayoutC> scale_C;
  hytlass::HostTensor<ElementScalingFactor, typename Gemm::LayoutC> scale_D;
  hytlass::HostTensor<ElementScalingFactor, typename Gemm::LayoutC> scale_Aux;
  hytlass::HostTensor<ElementAbsmax, typename Gemm::LayoutC> abs_max_Aux;
  hytlass::HostTensor<ElementAbsmax, typename Gemm::LayoutC> abs_max_D;
  hytlass::HostTensor<ElementAbsmax, typename Gemm::LayoutC> reference_abs_max_Aux;
  hytlass::HostTensor<ElementAbsmax, typename Gemm::LayoutC> reference_abs_max_D;
//
// Methods
//
  TestbedRunner(
    bool scaleA = true,
    bool scaleB = true,
    bool scaleC = true,
    hytlass::Distribution::Kind init_A_ = hytlass::Distribution::Uniform,
    hytlass::Distribution::Kind init_B_ = hytlass::Distribution::Uniform,
    hytlass::Distribution::Kind init_C_ = hytlass::Distribution::Uniform,
    uint64_t seed_ = 2080
  ):
    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }
/// Helper to initialize scaling factors
  template <typename Element, typename Layout>
  bool initialize_scale_factor(hytlass::TensorView<Element, Layout> view, uint64_t seed, int bits = 0) {
    hytlass::reference::host::TensorFillRandomUniform(view, seed, double(1.), double(0.), bits);
    return true;
  }
/// Helper to initialize a tensor view
  template <typename Element, typename Layout>
  bool initialize_tensor(
    hytlass::TensorView<Element, Layout> view,
    hytlass::Distribution::Kind dist_kind,
    uint64_t seed) {

    if (dist_kind == hytlass::Distribution::Uniform) {

      double scope_max, scope_min;
      int bits_input = hytlass::sizeof_bits<Element>::value;
      int bits_output = hytlass::sizeof_bits<typename Gemm::ElementC>::value;

      if (bits_input == 1) {
        scope_max = 2;
        scope_min = 0;
      }
      else if (bits_input <= 8) {
        scope_max = 2;
        scope_min = -2;
      }
      else if (bits_output == 16) {
        scope_max = 5;
        scope_min = -5;
      }
      else {
        scope_max = 8;
        scope_min = -8;
      }

      hytlass::reference::host::TensorFillRandomUniform(view, seed, scope_max, scope_min, 0);
    }
    else if (dist_kind == hytlass::Distribution::Identity) {
      hytlass::reference::host::TensorFillIdentity(view);
    }
    else if (dist_kind == hytlass::Distribution::Gaussian) {
      hytlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
    }
    else if (dist_kind == hytlass::Distribution::Sequential) {
      hytlass::reference::host::BlockFillSequential(view.data(), view.capacity());
    }
    else {
      std::cerr << "Not implemented";
      return false;
    }

    return true;
  }
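  // Note on the fill ranges above: narrower value ranges are used for narrower
  // element types (e.g. [-2, 2] for inputs of 8 bits or fewer), which keeps
  // intermediate products small. This is presumably what allows the exact
  // TensorEquals comparison in compare_reference() below to remain meaningful
  // despite low-precision arithmetic.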
/// Initializes data structures
  void initialize(const Options& options) {
    //
    // Allocate the GEMM workspace
    //
    tensor_A.resize(options.problem_size.mk());
    tensor_B.resize(options.problem_size.kn());
    tensor_C.resize(options.problem_size.mn());
    tensor_D.resize(options.problem_size.mn());
    tensor_Vector.resize({1, options.problem_size.n()});
    reference_D.resize(options.problem_size.mn(), false);
    tmp_D.resize(options.problem_size.mn(), false);

    initialize_tensor(tensor_A.host_view(), init_A, seed + 2019);
    initialize_tensor(tensor_B.host_view(), init_B, seed + 2018);
    initialize_tensor(tensor_C.host_view(), init_C, seed + 2017);
    initialize_tensor(tensor_Vector.host_view(), init_C, seed + 2020);

    // It is possible to randomly initialize to all zeros, so override this with non-zeros
    // in the upper left corner of each operand.
    hytlass::Coord<2> origin(0);
    tensor_A.host_view().at(origin) = typename Gemm::ElementA(1);
    tensor_B.host_view().at(origin) = typename Gemm::ElementB(1);
    tensor_C.host_view().at(origin) = typename Gemm::ElementC(1);
    tensor_Vector.host_view().at(origin) = typename Gemm::ElementC(1);

    hytlass::reference::host::TensorFill(tensor_D.host_view());
    hytlass::reference::host::TensorCopy(reference_D.host_view(), tensor_C.host_view());

    tensor_A.sync_device();
    tensor_B.sync_device();
    tensor_C.sync_device();
    tensor_D.sync_device();
    tensor_Vector.sync_device();

    int scale_bits = 2;

    if (options.scale_A) {
      scale_A.resize({1, 1});
      initialize_scale_factor(scale_A.host_view(), seed + 2021, scale_bits);
      scale_A.sync_device();
    }

    if (options.scale_B) {
      scale_B.resize({1, 1});
      initialize_scale_factor(scale_B.host_view(), seed + 2022, scale_bits);
      scale_B.sync_device();
    }

    if (options.scale_C) {
      scale_C.resize({1, 1});
      initialize_scale_factor(scale_C.host_view(), seed + 2023, scale_bits);
      scale_C.sync_device();
    }

    if (kScaleOutput) {
      scale_D.resize({1, 1});
      initialize_scale_factor(scale_D.host_view(), seed + 2024, scale_bits);
      scale_D.sync_device();

      abs_max_D.resize({1, 1});
      hytlass::reference::host::TensorFill(abs_max_D.host_view());
      abs_max_D.sync_device();

      reference_abs_max_D.resize({1, 1});
    }

    if (kScaleAux) {
      tensor_Aux.resize(options.problem_size.mn());
      hytlass::reference::host::TensorFill(tensor_Aux.host_view());
      tensor_Aux.sync_device();

      scale_Aux.resize({1, 1});
      initialize_scale_factor(scale_Aux.host_view(), seed + 2025, scale_bits);
      scale_Aux.sync_device();

      abs_max_Aux.resize({1, 1});
      hytlass::reference::host::TensorFill(abs_max_Aux.host_view());
      abs_max_Aux.sync_device();

      reference_Aux.resize(options.problem_size.mn(), false);
      reference_abs_max_Aux.resize({1, 1});
    }
  }
/// Compares computed reference with device reference and outputs to a file if incorrect
  bool compare_reference(const Options& options) {
    tensor_D.sync_host();

    bool passed = hytlass::reference::host::TensorEquals(reference_D.host_view(), tensor_D.host_view());

    if (kScaleAux) {
      tensor_Aux.sync_host();
      abs_max_Aux.sync_host();
      passed &= hytlass::reference::host::TensorEquals(reference_Aux.host_view(), tensor_Aux.host_view());
      passed &= hytlass::reference::host::TensorEquals(abs_max_Aux.host_view(), reference_abs_max_Aux.host_view());
    }

    if (kScaleOutput) {
      abs_max_D.sync_host();
      passed &= hytlass::reference::host::TensorEquals(abs_max_D.host_view(), reference_abs_max_D.host_view());
    }

    if (!passed) {
      std::cerr << "Reference check failed" << std::endl;

      std::string output_file = "testbed_with_amax_errors.txt";
      std::ofstream file(output_file);

      file
        << "problem: " << options.problem_size
        << ", alpha: " << options.alpha
        << ", beta: " << options.beta << "\n\n";

      file
        << "A =\n" << tensor_A.host_view()
        << "\nB =\n" << tensor_B.host_view()
        << "\nC =\n" << tensor_C.host_view()
        << "\nVector =\n" << tensor_Vector.host_view()
        << "\nScaleA = " << scale_A.host_view()
        << "\nScaleB = " << scale_B.host_view()
        << "\nScaleC = " << scale_C.host_view()
        << "\nScaleD = " << scale_D.host_view()
        << "\nScaleAux = " << scale_Aux.host_view()
        << "\n\nReference D =\n" << reference_D.host_view()
        << "\nComputed D =\n" << tensor_D.host_view();

      if (kScaleAux) {
        file
          << "\n\nReference Aux =\n" << reference_Aux.host_view()
          << "\nComputed Aux =\n" << tensor_Aux.host_view()
          << "\n\nReference Absmax Aux = " << reference_abs_max_Aux.host_view()
          << "\nComputed Absmax Aux = " << abs_max_Aux.host_view();
      }

      if (kScaleOutput) {
        file
          << "\n\nReference Absmax D = " << reference_abs_max_D.host_view()
          << "\nComputed Absmax D = " << abs_max_D.host_view();
      }

      std::cerr << "Dumped results to " << output_file << std::endl;
    }

    return passed;
  }
/// Verifies the result is a GEMM
  bool verify(const Options& options) {

    hytlass::Coord<2> origin(0);

    ElementCompute scaled_alpha = options.alpha;
    if (options.scale_A) {
      scaled_alpha *= scale_A.host_view().at(origin);
    }
    if (options.scale_B) {
      scaled_alpha *= scale_B.host_view().at(origin);
    }

    ElementCompute scaled_beta = options.beta;
    if (options.scale_C) {
      scaled_beta *= scale_C.host_view().at(origin);
    }

    //
    // Verify
    //
    hytlass::reference::host::GemmComplex<
        typename Gemm::ElementA, typename Gemm::LayoutA,
        typename Gemm::ElementB, typename Gemm::LayoutB,
        typename Gemm::ElementC, typename Gemm::LayoutC,
        ElementCompute, ElementAccumulator, ElementAccumulator
    >(
      options.problem_size,
      scaled_alpha,
      tensor_A.host_ref(),
      Gemm::kTransformA,
      tensor_B.host_ref(),
      Gemm::kTransformB,
      scaled_beta,
      tensor_C.host_ref(),
      tmp_D.host_ref(),
      ElementAccumulator(0)
    );

    ElementCompute tmp_abs_max_Aux(0.);
    ElementCompute tmp_abs_max_D(0.);

    hytlass::NumericConverter<ElementCompute, typename Gemm::ElementC> cvt_c_to_compute;
    hytlass::NumericConverter<ElementCompute, ElementAccumulator> cvt_accum_to_compute;
    hytlass::NumericConverter<ElementAccumulator, ElementCompute> cvt_compute_to_accum;
    hytlass::NumericConverter<typename Gemm::EpilogueOutputOp::ElementOutput, ElementCompute> cvt_compute_to_d;
    hytlass::NumericConverter<typename Gemm::EpilogueOutputOp::ElementAuxOutput, ElementCompute> cvt_compute_to_aux;

    hytlass::absolute_value_op<ElementCompute> abs;
    hytlass::maximum_with_nan_propogation<ElementCompute> max;
    hytlass::epilogue::thread::ReLu<ElementCompute> act;

    ElementScalingFactor d_scale = kScaleOutput ? scale_D.host_view().at(origin) : ElementScalingFactor(1.);

    for (int m = 0; m < options.problem_size.m(); ++m) {
      for (int n = 0; n < options.problem_size.n(); ++n) {
        ElementCompute intermediate = cvt_accum_to_compute(tmp_D.host_view().at({m, n}));
        ElementCompute bias = cvt_c_to_compute(tensor_Vector.host_view().at({0, n}));
        ElementCompute aux = intermediate + bias;
        ElementCompute d = act(aux);
        tmp_abs_max_Aux = max(abs(aux), tmp_abs_max_Aux);
        tmp_abs_max_D = max(abs(d), tmp_abs_max_D);
        reference_D.host_view().at({m, n}) = cvt_compute_to_d(d * d_scale);

        if (kScaleAux) {
          reference_Aux.host_view().at({m, n}) = cvt_compute_to_aux(aux * scale_Aux.host_view().at(origin));
        }
      }
    }

    if (kScaleAux) {
      reference_abs_max_Aux.host_view().at(origin) = cvt_compute_to_accum(tmp_abs_max_Aux);
    }

    if (kScaleOutput) {
      reference_abs_max_D.host_view().at(origin) = cvt_compute_to_accum(tmp_abs_max_D);
    }

    return compare_reference(options);
  }
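  // In scalar form, the reference computed above is (per element, with each
  // scale factor applied only when the corresponding option or kScale* flag
  // is enabled):
  //
  //   intermediate = (alpha * scale_A * scale_B) * (A x B) + (beta * scale_C) * C
  //   aux          = intermediate + bias
  //   d            = ReLu(aux)
  //   D            = scale_D * d,    Aux = scale_Aux * aux
  //   abs_max_Aux  = max(|aux|),     abs_max_D = max(|d|)
  //
  // Note the abs-max reductions run over the unscaled aux and d values.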
/// Returns true if the hip device is sufficient to execute the kernel.
  bool sufficient() const {
    size_t smem_size = sizeof(typename Gemm::GemmKernel::SharedStorage);

    hipDeviceProp_t properties;
    int device_idx;

    hipError_t result = hipGetDevice(&device_idx);
    if (result != hipSuccess) {
      std::cerr << "hipGetDevice() failed with error: " << hipGetErrorString(result) << std::endl;
      return false;
    }

    result = hipGetDeviceProperties(&properties, device_idx);
    if (result != hipSuccess) {
      std::cerr << "hipGetDeviceProperties() failed with error: " << hipGetErrorString(result) << std::endl;
      return false;
    }

    return true;
  }
/// Executes one test
  bool run(Options& options) {

    // Waive test if insufficient hip device
    if (!sufficient()) {
      std::cerr << "Insufficient resources to run the kernel." << std::endl;
      return false;
    }

    this->initialize(options);

    //
    // Initialize the GEMM operator
    //
    typename Gemm::EpilogueOutputOp::Params::ActivationParams activation_params{
      ElementCompute(options.alpha),
      ElementCompute(options.beta)
    };

    typename Gemm::EpilogueOutputOp::Params epilogue_params{
      activation_params,
      scale_A.device_data(),
      scale_B.device_data(),
      scale_C.device_data(),
      scale_D.device_data(),
      scale_Aux.device_data(),
      abs_max_Aux.device_data(),
      abs_max_D.device_data()
    };

    typename Gemm::Arguments arguments{
      hytlass::gemm::GemmUniversalMode::kGemm,
      options.problem_size,
      /* batch_count = */ 1,
      epilogue_params,
      tensor_A.device_data(),
      tensor_B.device_data(),
      tensor_C.device_data(),
      tensor_D.device_data(),
      tensor_Aux.device_data(),
      tensor_Vector.device_data(),
      options.problem_size.m() * options.problem_size.k(),
      options.problem_size.n() * options.problem_size.k(),
      options.problem_size.m() * options.problem_size.n(),
      options.problem_size.m() * options.problem_size.n(),
      (int)options.problem_size.m(),    // Batch stride vector
      tensor_A.layout().stride(0),
      tensor_B.layout().stride(0),
      tensor_C.layout().stride(0),
      tensor_D.layout().stride(0),
      (int64_t)0                        // Leading dimension of vector. This must be 0
    };

    Gemm gemm_op;

    hytlass::Status status = gemm_op.can_implement(arguments);
    if (status != hytlass::Status::kSuccess) {
      std::cerr << "Gemm::can_implement() failed" << std::endl;
      return false;
    }

    size_t workspace_size = Gemm::get_workspace_size(arguments);
    hytlass::device_memory::allocation<uint8_t> workspace(workspace_size);

    status = gemm_op.initialize(arguments, workspace.get());
    if (status != hytlass::Status::kSuccess) {
      std::cerr << "Gemm::initialize() failed" << std::endl;
      return false;
    }

    //
    // Run the GEMM
    //
    status = gemm_op();
    if (status != hytlass::Status::kSuccess) {
      std::cerr << "Gemm::run() failed" << std::endl;
      return false;
    }

    hipError_t hip_error = hipDeviceSynchronize();
    if (hip_error != hipSuccess) {
      std::cerr << "hip error: " << hipGetErrorString(hip_error) << std::endl;
      return false;
    }

    //
    // Verify
    //
    bool passed = true;
    if (options.reference_check) {
      passed &= this->verify(options);
    }
    else {
      std::cout << "Skipped reference check" << std::endl;
    }

    //
    // Warm up
    //
    for (int i = 0; i < options.warmup_iterations; ++i) {
      gemm_op();
    }

    //
    // Profile
    //
    hipEvent_t events[2];
    hipError_t error;
    for (auto& event : events) {
      error = hipEventCreate(&event);
      if (error != hipSuccess) {
        std::cerr << "hipEventCreate() failed: " << hipGetErrorString(error) << std::endl;
        return false;
      }
    }

    // Record an event at the start of a series of GEMM operations
    error = hipEventRecord(events[0]);
    if (error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(error) << std::endl;
      return false;
    }

    // Run profiling loop
    for (int iter = 0; iter < options.iterations; ++iter) {
      gemm_op();
    }

    // Record an event when the GEMM operations have been launched.
    error = hipEventRecord(events[1]);
    if (error != hipSuccess) {
      std::cerr << "hipEventRecord() failed: " << hipGetErrorString(error) << std::endl;
      return false;
    }

    // Wait for work on the device to complete.
    error = hipEventSynchronize(events[1]);
    if (error != hipSuccess) {
      std::cerr << "hipEventSynchronize() failed: " << hipGetErrorString(error) << std::endl;
      return false;
    }

    // Measure elapsed runtime
    float runtime_ms = 0;
    error = hipEventElapsedTime(&runtime_ms, events[0], events[1]);
    if (error != hipSuccess) {
      std::cerr << "hipEventElapsedTime() failed: " << hipGetErrorString(error) << std::endl;
      return false;
    }

    // Compute average runtime and GFLOPs.
    runtime_ms = runtime_ms / float(options.iterations);
    float gflops = options.gflops(runtime_ms / 1000.0f);

    std::cout << "Problem size: "
      << options.problem_size.m() << 'x' << options.problem_size.n() << 'x' << options.problem_size.k()
      << std::endl;
    std::cout << "Runtime (ms): " << runtime_ms << std::endl;
    std::cout << "GFLOPs/sec:   " << gflops << std::endl;

    // Cleanup
    for (auto event : events) {
      (void)hipEventDestroy(event);
    }

    return passed;
  }
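  // Timing note: the two hipEvent_t records bracket the entire profiling loop,
  // so runtime_ms above is the average wall-clock time per GEMM across
  // options.iterations runs, with the warmup iterations excluded from the
  // measurement.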
};
/////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **argv) {

  hipDeviceProp_t props;

  hipError_t error = hipGetDeviceProperties(&props, 0);
  if (error != hipSuccess) {
    std::cerr << "hipGetDeviceProperties() returned an error: " << hipGetErrorString(error) << std::endl;
    return -1;
  }

  //
  // Parse options
  //
  Options options;
  options.parse(argc, argv);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  if (options.error) {
    std::cerr << "Aborting execution." << std::endl;
    return -1;
  }

  std::cout << "Running GEMM with staged accumulation (OpMultiplyAdd)" << std::endl;
  std::cout << "=====================================================" << std::endl;

  TestbedRunner<Gemm_<hytlass::arch::OpMultiplyAdd>> testbed_staged_accum;
  bool passed = testbed_staged_accum.run(options);

  if (passed) {
    std::cout << "Passed" << std::endl;
  }
  else {
    std::cout << "Failed" << std::endl;
  }

  return 0;
}