gaoqiong / composable_kernel / Commits / 8a370fbb

Commit 8a370fbb, authored Jul 26, 2022 by Chao Liu

    Merge remote-tracking branch 'origin/develop' into group_conv

Parents: d8fdd226, 85978e02
Changes: 44 (showing 20 changed files with 2721 additions and 213 deletions)
Jenkinsfile                                                                        +183 -82
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp                                  +56 -45
example/24_batched_gemm_e_permute/batched_gemm_e_permute_xdl_fp16.cpp              +38 -32
example/28_grouped_gemm_bias/CMakeLists.txt                                        +1 -0
example/28_grouped_gemm_bias/grouped_gemm_bias_xdl_fp16.cpp                        +280 -0
example/29_batched_gemm_multi_d/CMakeLists.txt                                     +3 -0
example/29_batched_gemm_multi_d/batched_gemm_bias_xdl_fp16.cpp                     +248 -0
example/29_batched_gemm_multi_d/batched_gemm_xdl_fp16.cpp                          +217 -0
example/30_grouped_convnd_fwd_bias_relu/CMakeLists.txt                             +0 -0
example/30_grouped_convnd_fwd_bias_relu/README.md                                  +0 -0
example/30_grouped_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_common.hpp         +0 -0
example/30_grouped_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_relu_xdl_fp16.cpp  +0 -0
example/CMakeLists.txt                                                             +3 -1
include/ck/tensor_operation/gpu/device/device_batched_gemm_c_permute_xdl.hpp       +876 -0
include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp           +11 -3
include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute_xdl.hpp       +29 -15
include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp             +58 -0
include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp         +713 -0
include/ck/tensor_operation/gpu/device/device_gemm.hpp                             +0 -29
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp     +5 -6
Jenkinsfile
@@ -11,6 +11,12 @@ def show_node_info() {
     """
 }
 
+def runShell(String command){
+    def responseCode = sh returnStatus: true, script: "${command} &> tmp.txt"
+    def output = readFile(file: "tmp.txt")
+    return (output != "")
+}
+
 def cmake_build(Map conf=[:]){
     def compiler = conf.get("compiler", "/opt/rocm/bin/hipcc")
@@ -60,7 +66,7 @@ def cmake_build(Map conf=[:]){
     """
     def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
     // reduce parallelism when compiling, clang uses too much memory
-    def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 1 )) ${config_targets}")
+    def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 2 )) ${config_targets}")
     def execute_cmd = conf.get("execute_cmd", "")
     def cmd = conf.get("cmd", """
@@ -113,7 +119,14 @@ def buildHipClangJob(Map conf=[:]){
         retimage = docker.build("${image}", dockerArgs + '.')
         withDockerContainer(image: image, args: dockerOpts) {
             timeout(time: 5, unit: 'MINUTES'){
-                sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+                sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+                if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                    echo "GPU not found"
+                    throw e
+                }
+                else{
+                    echo "GPU is OK"
+                }
             }
         }
@@ -125,7 +138,14 @@ def buildHipClangJob(Map conf=[:]){
         retimage = docker.build("${image}", dockerArgs + " --no-cache .")
         withDockerContainer(image: image, args: dockerOpts) {
             timeout(time: 5, unit: 'MINUTES'){
-                sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+                sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log'
+                if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                    echo "GPU not found"
+                    throw e
+                }
+                else{
+                    echo "GPU is OK"
+                }
             }
         }
@@ -133,7 +153,14 @@ def buildHipClangJob(Map conf=[:]){
     withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
         timeout(time: 5, unit: 'HOURS')
         {
-            sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+            sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+            if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                echo "GPU not found"
+                throw e
+            }
+            else{
+                echo "GPU is OK"
+            }
             cmake_build(conf)
         }
     }
@@ -145,7 +172,6 @@ def reboot(){
     build job: 'reboot-slaves', propagate: false, parameters: [string(name: 'server', value: "${env.NODE_NAME}"),]
 }
-
 def buildHipClangJobAndReboot(Map conf=[:]){
     try{
         buildHipClangJob(conf)
@@ -162,7 +188,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){
     }
 }
-
 def runCKProfiler(Map conf=[:]){
     show_node_info()
@@ -189,7 +214,6 @@ def runCKProfiler(Map conf=[:]){
     }
-
     def variant = env.STAGE_NAME
     def retimage
     gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
@@ -197,7 +221,14 @@ def runCKProfiler(Map conf=[:]){
         retimage = docker.build("${image}", dockerArgs + '.')
         withDockerContainer(image: image, args: dockerOpts) {
             timeout(time: 5, unit: 'MINUTES'){
-                sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+                sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+                if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                    echo "GPU not found"
+                    throw e
+                }
+                else{
+                    echo "GPU is OK"
+                }
             }
         }
@@ -209,89 +240,69 @@ def runCKProfiler(Map conf=[:]){
         retimage = docker.build("${image}", dockerArgs + " --no-cache .")
         withDockerContainer(image: image, args: dockerOpts) {
             timeout(time: 5, unit: 'MINUTES'){
-                sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+                sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+                if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                    echo "GPU not found"
+                    throw e
+                }
+                else{
+                    echo "GPU is OK"
+                }
             }
         }
 
         withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
-            timeout(time: 5, unit: 'HOURS')
+            timeout(time: 24, unit: 'HOURS')
             {
                 cmake_build(conf)
                 dir("script"){
-                    //run gemm performance tests
-                    def gemm_log = "perf_gemm_${gpu_arch}.log"
-                    sh "rm -f ${gemm_log}"
-                    sh "echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}"
-                    sh "echo Node name: ${NODE_NAME} >> ${gemm_log}"
-                    sh "echo GPU_arch name: ${gpu_arch} >> ${gemm_log}"
-                    sh "rocminfo | grep 'Compute Unit:' >> ${gemm_log} "
-                    sh "hipcc --version | grep -e 'HIP version' >> ${gemm_log}"
-                    if (params.USE_9110){
-                        sh "echo Environment type: CI_9110 >> ${gemm_log}"
-                    }
-                    else{
-                        sh "echo Environment type: CI_release >> ${gemm_log}"
-                    }
-                    sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a ${gemm_log}"
-                    //results will be parsed, stored, and analyzed within the python script
-                    //the script will return 0 if the performance criteria are met
-                    //or return 1 if the criteria are not met
-                    archiveArtifacts "${gemm_log}"
-                    sh "python3 process_perf_data.py ${gemm_log} "
-                    //run resnet50 test
-                    def resnet256_log = "perf_resnet50_N256_${gpu_arch}.log"
-                    sh "rm -f ${resnet256_log}"
-                    sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet256_log}"
-                    sh "echo Node name: ${NODE_NAME} >> ${resnet256_log}"
-                    sh "echo GPU_arch name: ${gpu_arch} >> ${resnet256_log}"
-                    sh "rocminfo | grep 'Compute Unit:' >> ${resnet256_log} "
-                    sh "hipcc --version | grep -e 'HIP version' >> ${resnet256_log}"
-                    if (params.USE_9110){
-                        sh "echo Environment type: CI_9110 >> ${resnet256_log}"
-                    }
-                    else{
-                        sh "echo Environment type: CI_release >> ${resnet256_log}"
-                    }
-                    sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet256_log}"
-                    //first run tests with N=256
-                    sh "./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet256_log}"
-                    archiveArtifacts "${resnet256_log}"
-                    sh "python3 process_perf_data.py ${resnet256_log} "
-                    //then run with N=4
-                    def resnet4_log = "perf_resnet50_N4_${gpu_arch}.log"
-                    sh "rm -f ${resnet4_log}"
-                    sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet4_log}"
-                    sh "echo Node name: ${NODE_NAME} >> ${resnet4_log}"
-                    sh "echo GPU_arch name: ${gpu_arch} >> ${resnet4_log}"
-                    sh "rocminfo | grep 'Compute Unit:' >> ${resnet4_log} "
-                    sh "hipcc --version | grep -e 'HIP version' >> ${resnet4_log}"
-                    if (params.USE_9110){
-                        sh "echo Environment type: CI_9110 >> ${resnet4_log}"
-                    }
-                    else{
-                        sh "echo Environment type: CI_release >> ${resnet4_log}"
-                    }
-                    sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet4_log}"
-                    sh "./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet4_log}"
-                    archiveArtifacts "${resnet4_log}"
-                    sh "python3 process_perf_data.py ${resnet4_log} "
+                    if (params.RUN_FULL_QA){
+                        def qa_log = "qa_${gpu_arch}.log"
+                        if (params.USE_9110){
+                            sh "./run_full_performance_tests.sh 1 QA_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+                        }
+                        else{
+                            sh "./run_full_performance_tests.sh 1 QA_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+                        }
+                        archiveArtifacts "perf_gemm_${gpu_arch}.log"
+                        archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
+                        archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
+                        archiveArtifacts "perf_bathced_gemm_${gpu_arch}.log"
+                        archiveArtifacts "perf_grouped_gemm_${gpu_arch}.log"
+                        archiveArtifacts "perf_fwd_conv_${gpu_arch}.log"
+                        archiveArtifacts "perf_bwd_conv_${gpu_arch}.log"
+                        archiveArtifacts "perf_fusion_${gpu_arch}.log"
+                        archiveArtifacts "perf_reduction_${gpu_arch}.log"
+                        // stash perf files to master
+                        stash name: "perf_gemm_${gpu_arch}.log"
+                        stash name: "perf_resnet50_N256_${gpu_arch}.log"
+                        stash name: "perf_resnet50_N4_${gpu_arch}.log"
+                        stash name: "perf_bathced_gemm_${gpu_arch}.log"
+                        stash name: "perf_grouped_gemm_${gpu_arch}.log"
+                        stash name: "perf_fwd_conv_${gpu_arch}.log"
+                        stash name: "perf_bwd_conv_${gpu_arch}.log"
+                        stash name: "perf_fusion_${gpu_arch}.log"
+                        stash name: "perf_reduction_${gpu_arch}.log"
+                        //we will process results on the master node
+                    }
+                    else{
+                        if (params.USE_9110){
+                            sh "./run_performance_tests.sh 0 CI_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+                        }
+                        else{
+                            sh "./run_performance_tests.sh 0 CI_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+                        }
+                        archiveArtifacts "perf_gemm_${gpu_arch}.log"
+                        archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
+                        archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
+                        // stash perf files to master
+                        stash name: "perf_gemm_${gpu_arch}.log"
+                        stash name: "perf_resnet50_N256_${gpu_arch}.log"
+                        stash name: "perf_resnet50_N4_${gpu_arch}.log"
+                        //we will process the results on the master node
+                    }
                 }
             }
         }
@@ -299,7 +310,6 @@ def runCKProfiler(Map conf=[:]){
     return retimage
 }
-
 def runPerfTest(Map conf=[:]){
     try{
         runCKProfiler(conf)
@@ -316,8 +326,76 @@ def runPerfTest(Map conf=[:]){
     }
 }
+
+def process_results(Map conf=[:]){
+    env.HSA_ENABLE_SDMA=0
+    checkout scm
+    def image = "composable_kernels"
+    def prefixpath = "/opt/rocm"
+    def gpu_arch = conf.get("gpu_arch", "gfx908")
+
+    // Jenkins is complaining about the render group
+    def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+    if (conf.get("enforce_xnack_on", false))
+    {
+        dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+    }
+
+    def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
+
+    def variant = env.STAGE_NAME
+    def retimage
+
+    gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
+        try {
+            retimage = docker.build("${image}", dockerArgs + '.')
+        }
+        catch(org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+            echo "The job was cancelled or aborted"
+            throw e
+        }
+    }
+
+    withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+        timeout(time: 1, unit: 'HOURS'){
+            try{
+                dir("script"){
+                    if (params.RUN_FULL_QA){
+                        // unstash perf files to master
+                        unstash "perf_gemm_${gpu_arch}.log"
+                        unstash "perf_resnet50_N256_${gpu_arch}.log"
+                        unstash "perf_resnet50_N4_${gpu_arch}.log"
+                        unstash "perf_bathced_gemm_${gpu_arch}.log"
+                        unstash "perf_grouped_gemm_${gpu_arch}.log"
+                        unstash "perf_fwd_conv_${gpu_arch}.log"
+                        unstash "perf_bwd_conv_${gpu_arch}.log"
+                        unstash "perf_fusion_${gpu_arch}.log"
+                        unstash "perf_reduction_${gpu_arch}.log"
+                        sh "./process_qa_data.sh ${gpu_arch}"
+                    }
+                    else{
+                        // unstash perf files to master
+                        unstash "perf_gemm_${gpu_arch}.log"
+                        unstash "perf_resnet50_N256_${gpu_arch}.log"
+                        unstash "perf_resnet50_N4_${gpu_arch}.log"
+                        sh "./process_perf_data.sh ${gpu_arch}"
+                    }
+                }
+            }
+            catch(e){
+                echo "throwing error exception while processing performance test results"
+                echo 'Exception occurred: ' + e.toString()
+                throw e
+            }
+        }
+    }
+}
+
+//launch develop branch daily at 23:00 in FULL_QA mode
+//CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;USE_9110=true''' : ""
+
 pipeline {
     agent none
+    //triggers {
+    //    cron(CRON_SETTINGS)
+    //}
     options {
         parallelsAlwaysFailFast()
     }
@@ -325,7 +403,11 @@ pipeline {
         booleanParam(
             name: "USE_9110",
             defaultValue: true,
-            description: "")
+            description: "Select compiler version: 9110 (default) or release")
+        booleanParam(
+            name: "RUN_FULL_QA",
+            defaultValue: false,
+            description: "Select whether to run small set of performance tests (default) or full QA")
     }
     environment{
         dbuser = "${dbuser}"
@@ -438,6 +520,25 @@ pipeline {
                 }
             }
         }
+        stage("Process Performance Test Results")
+        {
+            parallel
+            {
+                stage("Process results for gfx908"){
+                    agent{ label 'mici' }
+                    steps{
+                        process_results(gpu_arch: "gfx908")
+                    }
+                }
+                stage("Process results for gfx90a"){
+                    agent{ label 'mici' }
+                    steps{
+                        process_results(gpu_arch: "gfx90a")
+                    }
+                }
+            }
+        }
         /* enable after the cmake file supports packaging
         stage("Packages") {
             when {
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
@@ -29,34 +29,40 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
-using ADataType   = ck::half_t;
-using BDataType   = ck::half_t;
-using CDataType   = ck::half_t;
-using AccDataType = float;
-
-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
-using CLayout = ck::tensor_layout::gemm::RowMajor;
-
-using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = F16;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
 
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// static constexpr auto GemmMNPadding =
+//     ck::tensor_operation::device::GemmSpecialization::MNPadding;
 
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdl
-// clang-format off
-//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| Num|
-//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| Prefetch|
-//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| |
-//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-        < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1>;
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
+// clang-format off
+//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::
-    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+    ReferenceGemm<ADataType, BDataType, EDataType, AccDataType, AElementOp, BElementOp, CDEElementOp>;
 
 int main(int argc, char* argv[])
 {
@@ -81,11 +87,11 @@ int main(int argc, char* argv[])
     int group_count = rand() % 16 + 1;
 
     // GEMM shape
-    std::vector<ck::tensor_operation::device::GemmShape> gemm_shapes;
+    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
     std::vector<const void*> p_a, p_b;
     std::vector<void*> p_c;
 
-    gemm_shapes.reserve(group_count);
+    gemm_descs.reserve(group_count);
 
     for(int i = 0; i < group_count; i++)
     {
@@ -93,7 +99,11 @@ int main(int argc, char* argv[])
         int N = 128 + 128 * i;
         int K = 64 + 64 * i;
 
-        gemm_shapes.push_back({M, N, K, K, K, N});
+        int stride_A = K;
+        int stride_B = K;
+        int stride_C = N;
+
+        gemm_descs.push_back({M, N, K, stride_A, stride_B, stride_C, {}});
     }
 
     auto f_host_tensor_descriptor =
@@ -111,10 +121,9 @@ int main(int argc, char* argv[])
     };
 
     std::vector<Tensor<ADataType>> a_tensors;
-    ;
     std::vector<Tensor<BDataType>> b_tensors;
-    std::vector<Tensor<CDataType>> c_host_tensors;
-    std::vector<Tensor<CDataType>> c_device_tensors;
+    std::vector<Tensor<EDataType>> c_host_tensors;
+    std::vector<Tensor<EDataType>> c_device_tensors;
 
     a_tensors.reserve(group_count);
     b_tensors.reserve(group_count);
@@ -131,25 +140,25 @@ int main(int argc, char* argv[])
     std::size_t flop = 0, num_btype = 0;
 
-    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
+    for(std::size_t i = 0; i < gemm_descs.size(); i++)
     {
         a_tensors.push_back(Tensor<ADataType>(f_host_tensor_descriptor(
-            gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{})));
+            gemm_descs[i].M_, gemm_descs[i].K_, gemm_descs[i].stride_A_, ALayout{})));
         b_tensors.push_back(Tensor<BDataType>(f_host_tensor_descriptor(
-            gemm_shapes[i].K, gemm_shapes[i].N, gemm_shapes[i].StrideB, BLayout{})));
+            gemm_descs[i].K_, gemm_descs[i].N_, gemm_descs[i].stride_B_, BLayout{})));
-        c_host_tensors.push_back(Tensor<CDataType>(f_host_tensor_descriptor(
-            gemm_shapes[i].M, gemm_shapes[i].N, gemm_shapes[i].StrideC, CLayout{})));
-        c_device_tensors.push_back(Tensor<CDataType>(f_host_tensor_descriptor(
-            gemm_shapes[i].M, gemm_shapes[i].N, gemm_shapes[i].StrideC, CLayout{})));
+        c_host_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
+            gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{})));
+        c_device_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
+            gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{})));
 
         std::cout << "gemm[" << i << "] a_m_k: " << a_tensors[i].mDesc
                   << " b_k_n: " << b_tensors[i].mDesc << " c_m_n: " << c_device_tensors[i].mDesc
                   << std::endl;
 
-        flop += std::size_t(2) * gemm_shapes[i].M * gemm_shapes[i].K * gemm_shapes[i].N;
+        flop += std::size_t(2) * gemm_descs[i].M_ * gemm_descs[i].K_ * gemm_descs[i].N_;
 
         num_btype += sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize() +
                      sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize() +
-                     sizeof(CDataType) * c_device_tensors[i].mDesc.GetElementSize();
+                     sizeof(EDataType) * c_device_tensors[i].mDesc.GetElementSize();
 
         switch(init_method)
         {
@@ -168,14 +177,14 @@ int main(int argc, char* argv[])
         }
     }
 
-    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
+    for(std::size_t i = 0; i < gemm_descs.size(); i++)
     {
         a_tensors_device.emplace_back(std::make_unique<DeviceMem>(
             sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpaceSize()));
         b_tensors_device.emplace_back(std::make_unique<DeviceMem>(
             sizeof(BDataType) * b_tensors[i].mDesc.GetElementSpaceSize()));
         c_tensors_device.emplace_back(std::make_unique<DeviceMem>(
-            sizeof(CDataType) * c_device_tensors[i].mDesc.GetElementSpaceSize()));
+            sizeof(EDataType) * c_device_tensors[i].mDesc.GetElementSpaceSize()));
 
         a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
         b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());
@@ -187,14 +196,16 @@ int main(int argc, char* argv[])
     auto a_element_op = AElementOp{};
     auto b_element_op = BElementOp{};
-    auto c_element_op = CElementOp{};
+    auto c_element_op = CDEElementOp{};
 
     auto gemm    = DeviceGemmInstance{};
     auto invoker = gemm.MakeInvoker();
 
+    std::vector<std::array<const void*, 0>> p_Ds = {};
+
     // do GEMM
     auto argument =
-        gemm.MakeArgument(p_a, p_b, p_c, gemm_shapes, a_element_op, b_element_op, c_element_op);
+        gemm.MakeArgument(p_a, p_b, p_Ds, p_c, gemm_descs, a_element_op, b_element_op, c_element_op);
 
     DeviceMem gemm_desc_workspace(gemm.GetWorkSpaceSize(&argument));
@@ -219,7 +230,7 @@ int main(int argc, char* argv[])
     bool pass = true;
     if(do_verification)
     {
-        for(std::size_t i = 0; i < gemm_shapes.size(); i++)
+        for(std::size_t i = 0; i < gemm_descs.size(); i++)
         {
             c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());
             auto ref_gemm    = ReferenceGemmInstance{};
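Note: the hunks above migrate this example from the old GemmShape argument struct to GemmDesc, whose members carry a trailing underscore (M_, N_, K_, stride_A_, ...) and which adds a per-group list of D-tensor strides. The following is a minimal standalone C++ sketch of that descriptor layout, not part of this commit; GemmDescSketch is a hypothetical stand-in for ck::tensor_operation::device::GemmDesc.

// Standalone sketch: mirrors the GemmDesc fields used above so the renaming
// is easy to follow. GemmDescSketch is hypothetical, not the CK type.
#include <cstddef>
#include <iostream>
#include <vector>

struct GemmDescSketch
{
    int M_, N_, K_;
    int stride_A_, stride_B_, stride_C_;
    std::vector<int> stride_Ds_; // one stride per fused D tensor; empty when there are no Ds
};

int main()
{
    std::vector<GemmDescSketch> gemm_descs;
    for(int i = 0; i < 4; i++)
    {
        int M = 256 + 256 * i, N = 128 + 128 * i, K = 64 + 64 * i;
        // row-major A (leading stride K), column-major B (stride K), row-major E (stride N)
        gemm_descs.push_back({M, N, K, K, K, N, {}});
    }

    std::size_t flop = 0;
    for(const auto& d : gemm_descs)
        flop += std::size_t(2) * d.M_ * d.N_ * d.K_; // 2*M*N*K flops per GEMM, as above

    std::cout << "total flop: " << flop << std::endl;
    return 0;
}

With no fused D tensors, the stride list stays empty, which matches the `{}` the updated example passes in `gemm_descs.push_back({M, N, K, stride_A, stride_B, stride_C, {}})`.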
example/24_batched_gemm_e_permute/batched_gemm_e_permute_xdl_fp16.cpp
@@ -26,31 +26,29 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
-using ADataType        = ck::half_t;
-using BDataType        = ck::half_t;
-using AccDataType      = float;
-using CShuffleDataType = ck::half_t;
-using EDataType        = ck::half_t;
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F16;
+using EDataType        = F16;
 
-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using ALayout = Row;
+using BLayout = Col;
+using ELayout = Row;
 
-using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using CDEElementOp = ck::tensor_operation::element_wise::PassThrough;
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
 
-// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
-// static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
-static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
 
 // clang-format off
 using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmEPermuteXdl
-//######| ALayout| BLayout| AData| BData| AccData| CShuffle| EData| A| B| C| GEMM| Num| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-//######| | | Type| Type| Type| Data| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ... | ScalarPerVector|
-//        < Row, Col, F16, F16, F32, F16, F16, PassThrough, PassThrough, PassThrough, MNPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>;
-        < Row, Col, F16, F16, F32, F16, F16, PassThrough, PassThrough, PassThrough, MNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>;
+//######| ALayout| BLayout| ELayout| AData| BData| AccData| CShuffle| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| | | | Type| Type| Type| DataType| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ... | ScalarPerVector|
+        < ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
 // clang-format on
 
 using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
@@ -62,15 +60,18 @@ int main(int argc, char* argv[])
     int init_method = 1;
     bool time_kernel = false;
 
-    const int M = 88;
-    const int N = 64;
-    const int K = 88;
+    const int M = 256;
+    const int N = 128;
+    const int K = 64;
 
     const int stride_A = K;
     const int stride_B = K;
 
-    const int G0 = 1024;
-    const int G1 = 10;
+    const int batch_stride_A = M * K;
+    const int batch_stride_B = K * N;
+
+    const int G0 = 16;
+    const int G1 = 8;
 
     const int batch_count = G0 * G1;
@@ -102,21 +103,24 @@ int main(int argc, char* argv[])
                                        std::size_t row,
                                        std::size_t col,
                                        std::size_t stride,
+                                       std::size_t batch_stride,
                                        auto layout) {
         if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
         {
             return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
-                                        std::vector<std::size_t>({row * stride, stride, 1}));
+                                        std::vector<std::size_t>({batch_stride, stride, 1}));
         }
         else
        {
             return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
-                                        std::vector<std::size_t>({col * stride, 1, stride}));
+                                        std::vector<std::size_t>({batch_stride, 1, stride}));
        }
    };
 
-    Tensor<ADataType> a_g_m_k(f_host_tensor_descriptor(batch_count, M, K, stride_A, ALayout{}));
-    Tensor<BDataType> b_g_k_n(f_host_tensor_descriptor(batch_count, K, N, stride_B, BLayout{}));
+    Tensor<ADataType> a_g_m_k(
+        f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{}));
+    Tensor<BDataType> b_g_k_n(
+        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
 
     auto f_host_e_tensor_descriptor = [](std::size_t G0_,
                                          std::size_t G1_,
@@ -169,7 +173,7 @@ int main(int argc, char* argv[])
     auto gemm    = DeviceGemmInstance{};
     auto invoker = gemm.MakeInvoker();
 
     // do GEMM
     auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
                                       static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
                                       static_cast<EDataType*>(e_device_buf.GetDeviceBuffer()),
@@ -178,11 +182,13 @@ int main(int argc, char* argv[])
                                       K,
                                       stride_A,
                                       stride_B,
+                                      batch_stride_A,
+                                      batch_stride_B,
                                       batched_gemm_e_permute_desc,
-                                      a_element_op,
-                                      b_element_op,
-                                      cde_element_op,
-                                      batch_count);
+                                      batch_count,
+                                      a_element_op,
+                                      b_element_op,
+                                      cde_element_op);
 
     if(!gemm.IsSupportedArgument(argument))
     {
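Note: the descriptor change above replaces the implicitly derived batch stride (row * stride or col * stride) with an explicit batch_stride argument. A small standalone sketch, not CK API, of what an explicit batch stride buys for a 3-D {G, M, K} row-major descriptor; offset_row_major is a hypothetical helper:

// Standalone sketch: linear offset under lengths {G, M, K} with
// strides {batch_stride, stride, 1}, as in the updated descriptor.
#include <cstddef>
#include <iostream>

std::size_t offset_row_major(std::size_t g, std::size_t m, std::size_t k,
                             std::size_t stride, std::size_t batch_stride)
{
    return g * batch_stride + m * stride + k;
}

int main()
{
    const std::size_t M = 256, K = 64;
    // dense batches: identical to the old row * stride behavior
    std::cout << offset_row_major(2, 3, 5, K, M * K) << "\n";
    // batch_stride = 0: every batch aliases the same M x K matrix (broadcast)
    std::cout << offset_row_major(2, 3, 5, K, 0) << "\n";
    return 0;
}

Decoupling the batch stride from the row stride also allows padded batches, which is presumably why the MakeArgument call above now takes batch_stride_A and batch_stride_B explicitly.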
example/28_grouped_gemm_bias/CMakeLists.txt
new file mode 100644

add_example_executable(example_grouped_gemm_bias_xdl_fp16 grouped_gemm_bias_xdl_fp16.cpp)
example/28_grouped_gemm_bias/grouped_gemm_bias_xdl_fp16.cpp
new file mode 100644

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using F16 = ck::half_t;
using F32 = float;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add         = ck::tensor_operation::element_wise::Add;

using ADataType        = F16;
using BDataType        = F16;
using AccDataType      = F32;
using CShuffleDataType = F16;
using DDataType        = F16;
using DsDataType       = ck::Tuple<DDataType>;
using EDataType        = F16;

using ALayout  = Row;
using BLayout  = Col;
using DLayout  = Row;
using DsLayout = ck::Tuple<DLayout>;
using ELayout  = Row;

using AElementOp   = PassThrough;
using BElementOp   = PassThrough;
using CDEElementOp = Add;

static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;

using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
// clang-format off
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on

int main(int argc, char* argv[])
{
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;

    if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=n0, 1=yes)\n");
        exit(0);
    }

    int group_count = rand() % 16 + 1;

    // GEMM shape
    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
    std::vector<const void*> p_a, p_b;
    std::vector<std::array<const void*, 1>> p_ds;
    std::vector<void*> p_c;

    gemm_descs.reserve(group_count);

    for(int i = 0; i < group_count; i++)
    {
        int M = 256 + 256 * i;
        int N = 128 + 128 * i;
        int K = 64 + 64 * i;

        int stride_A = K;
        int stride_B = K;
        int stride_C = N;

        std::vector<ck::index_t> stride_Ds = {0};

        gemm_descs.push_back({M, N, K, stride_A, stride_B, stride_C, stride_Ds});
    }

    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                            std::vector<std::size_t>({1, stride}));
            }
        };

    std::vector<Tensor<ADataType>> a_tensors;
    std::vector<Tensor<BDataType>> b_tensors;
    std::vector<Tensor<DDataType>> d_tensors;
    std::vector<Tensor<EDataType>> e_host_tensors;
    std::vector<Tensor<EDataType>> e_device_tensors;

    a_tensors.reserve(group_count);
    b_tensors.reserve(group_count);
    d_tensors.reserve(group_count);
    e_host_tensors.reserve(group_count);
    e_device_tensors.reserve(group_count);

    using DeviceMemPtr = std::unique_ptr<DeviceMem>;

    std::vector<DeviceMemPtr> a_tensors_device, b_tensors_device, d_tensors_device,
        e_tensors_device;

    a_tensors_device.reserve(group_count);
    b_tensors_device.reserve(group_count);
    d_tensors_device.reserve(group_count);
    e_tensors_device.reserve(group_count);

    std::size_t flop = 0, num_btype = 0;

    for(std::size_t i = 0; i < gemm_descs.size(); i++)
    {
        a_tensors.push_back(Tensor<ADataType>(f_host_tensor_descriptor(
            gemm_descs[i].M_, gemm_descs[i].K_, gemm_descs[i].stride_A_, ALayout{})));
        b_tensors.push_back(Tensor<BDataType>(f_host_tensor_descriptor(
            gemm_descs[i].K_, gemm_descs[i].N_, gemm_descs[i].stride_B_, BLayout{})));
        d_tensors.push_back(Tensor<DDataType>(f_host_tensor_descriptor(
            gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_Ds_[0], ELayout{})));
        e_host_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
            gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{})));
        e_device_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
            gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{})));

        std::cout << "gemm[" << i << "] a_m_k: " << a_tensors[i].mDesc
                  << " b_k_n: " << b_tensors[i].mDesc
                  << " c_m_n: " << e_device_tensors[i].mDesc << std::endl;

        flop += std::size_t(2) * gemm_descs[i].M_ * gemm_descs[i].K_ * gemm_descs[i].N_;

        num_btype += sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize() +
                     sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize() +
                     sizeof(EDataType) * e_device_tensors[i].mDesc.GetElementSize();

        switch(init_method)
        {
        case 0: break;
        case 1:
            a_tensors[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
            b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
            d_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
            break;
        case 2:
            a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
            b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
            d_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
            break;
        default:
            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
            d_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
        }
    }

    for(std::size_t i = 0; i < gemm_descs.size(); i++)
    {
        a_tensors_device.emplace_back(std::make_unique<DeviceMem>(
            sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpaceSize()));
        b_tensors_device.emplace_back(std::make_unique<DeviceMem>(
            sizeof(BDataType) * b_tensors[i].mDesc.GetElementSpaceSize()));
        d_tensors_device.emplace_back(std::make_unique<DeviceMem>(
            sizeof(DDataType) * d_tensors[i].mDesc.GetElementSpaceSize()));
        e_tensors_device.emplace_back(std::make_unique<DeviceMem>(
            sizeof(EDataType) * e_device_tensors[i].mDesc.GetElementSpaceSize()));

        a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
        b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());
        d_tensors_device[i]->ToDevice(d_tensors[i].mData.data());

        p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
        p_b.push_back(b_tensors_device[i]->GetDeviceBuffer());
        p_ds.push_back({d_tensors_device[i]->GetDeviceBuffer()});
        p_c.push_back(e_tensors_device[i]->GetDeviceBuffer());
    }

    auto a_element_op   = AElementOp{};
    auto b_element_op   = BElementOp{};
    auto cde_element_op = CDEElementOp{};

    auto gemm    = DeviceGemmInstance{};
    auto invoker = gemm.MakeInvoker();

    // do GEMM
    auto argument = gemm.MakeArgument(
        p_a, p_b, p_ds, p_c, gemm_descs, a_element_op, b_element_op, cde_element_op);

    DeviceMem gemm_desc_workspace(gemm.GetWorkSpaceSize(&argument));

    gemm.SetWorkSpacePointer(&argument, gemm_desc_workspace.GetDeviceBuffer());

    if(!gemm.IsSupportedArgument(argument))
    {
        throw std::runtime_error("wrong! device_gemm with the specified compilation parameters does "
                                 "not support this GEMM problem");
    }

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << gemm.GetTypeString() << std::endl;

    bool pass = true;
    if(do_verification)
    {
        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,
                                                                                EDataType,
                                                                                AccDataType,
                                                                                AElementOp,
                                                                                BElementOp,
                                                                                PassThrough>;

        for(std::size_t i = 0; i < gemm_descs.size(); i++)
        {
            e_tensors_device[i]->FromDevice(e_device_tensors[i].mData.data());
            auto ref_gemm    = ReferenceGemmInstance{};
            auto ref_invoker = ref_gemm.MakeInvoker();

            auto ref_argument = ref_gemm.MakeArgument(a_tensors[i],
                                                      b_tensors[i],
                                                      e_host_tensors[i],
                                                      a_element_op,
                                                      b_element_op,
                                                      PassThrough{});

            ref_invoker.Run(ref_argument);

            for(int m = 0; m < gemm_descs[i].M_; ++m)
            {
                for(int n = 0; n < gemm_descs[i].N_; ++n)
                {
                    cde_element_op(
                        e_host_tensors[i](m, n), e_host_tensors[i](m, n), d_tensors[i](m, n));
                }
            }

            pass &= ck::utils::check_err(e_device_tensors[i].mData, e_host_tensors[i].mData);
        }
    }

    return pass ? 0 : 1;
}
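Note: in the new example above, bias addition is expressed as the CDE elementwise op Add applied on the host to the reference GEMM output, with stride_Ds = {0} so a single bias row is broadcast over the M dimension. A standalone sketch of that combine step, not part of this commit; add_cde is a hypothetical stand-in for ck::tensor_operation::element_wise::Add:

// Standalone sketch: host-side e = add(e, d), as in the verification
// loop above. add_cde is hypothetical, not the CK functor.
#include <cstddef>
#include <iostream>
#include <vector>

void add_cde(float& e, float e_in, float d) { e = e_in + d; }

int main()
{
    std::vector<float> e = {1.0f, 2.0f}; // reference GEMM output row
    std::vector<float> d = {0.5f, 0.5f}; // bias row, broadcast over M via stride 0
    for(std::size_t i = 0; i < e.size(); ++i)
        add_cde(e[i], e[i], d[i]);
    std::cout << e[0] << ", " << e[1] << std::endl; // 1.5, 2.5
    return 0;
}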
example/29_batched_gemm_multi_d/CMakeLists.txt
new file mode 100644

add_example_executable(example_batched_gemm_xdl_fp16 batched_gemm_xdl_fp16.cpp)
add_example_executable(example_batched_gemm_bias_xdl_fp16 batched_gemm_bias_xdl_fp16.cpp)
example/29_batched_gemm_multi_d/batched_gemm_bias_xdl_fp16.cpp
new file mode 100644

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using F16 = ck::half_t;
using F32 = float;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add         = ck::tensor_operation::element_wise::Add;

using ADataType        = F16;
using BDataType        = F16;
using AccDataType      = F32;
using CShuffleDataType = F16;
using DDataType        = F16;
using DsDataType       = ck::Tuple<DDataType>;
using EDataType        = F16;

using ALayout  = Row;
using BLayout  = Col;
using DLayout  = Row;
using DsLayout = ck::Tuple<DLayout>;
using ELayout  = Row;

using AElementOp   = PassThrough;
using BElementOp   = PassThrough;
using CDEElementOp = Add;

static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on

int main(int argc, char* argv[])
{
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;

    const int M = 256 * (rand() % 16 + 1);
    const int N = 128 * (rand() % 16 + 1);
    const int K = 64 * (rand() % 16 + 1);

    const int stride_A = K;
    const int stride_B = K;
    const int stride_D = 0;
    const int stride_E = N;

    const int batch_stride_A = M * K;
    const int batch_stride_B = K * N;
    const int batch_stride_D = N;
    const int batch_stride_E = M * N;

    const int batch_count = 16;

    if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=n0, 1=yes)\n");
        exit(0);
    }

    // GEMM shape
    auto f_host_tensor_descriptor = [](std::size_t batch_count_,
                                       std::size_t row,
                                       std::size_t col,
                                       std::size_t stride,
                                       std::size_t batch_stride,
                                       auto layout) {
        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
            return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
                                        std::vector<std::size_t>({batch_stride, stride, 1}));
        }
        else
        {
            return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
                                        std::vector<std::size_t>({batch_stride, 1, stride}));
        }
    };

    Tensor<ADataType> a_g_m_k(
        f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{}));
    Tensor<BDataType> b_g_k_n(
        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
    Tensor<DDataType> d_g_m_n(
        f_host_tensor_descriptor(batch_count, M, N, stride_D, batch_stride_D, DLayout{}));
    Tensor<EDataType> e_g_m_n_device_result(
        f_host_tensor_descriptor(batch_count, M, N, stride_E, batch_stride_E, ELayout{}));

    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
    std::cout << "d_g_m_n: " << d_g_m_n.mDesc << std::endl;
    std::cout << "e_g_m_n: " << e_g_m_n_device_result.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
        d_g_m_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
        break;
    default:
        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
        d_g_m_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
        break;
    }

    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
    DeviceMem d_device_buf(sizeof(DDataType) * d_g_m_n.mDesc.GetElementSpaceSize());
    DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a_g_m_k.mData.data());
    b_device_buf.ToDevice(b_g_k_n.mData.data());
    d_device_buf.ToDevice(d_g_m_n.mData.data());

    auto a_element_op   = AElementOp{};
    auto b_element_op   = BElementOp{};
    auto cde_element_op = CDEElementOp{};

    auto gemm    = DeviceGemmInstance{};
    auto invoker = gemm.MakeInvoker();

    // do GEMM
    auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
                                      b_device_buf.GetDeviceBuffer(),
                                      {d_device_buf.GetDeviceBuffer()},
                                      c_device_buf.GetDeviceBuffer(),
                                      M,
                                      N,
                                      K,
                                      batch_count,
                                      stride_A,
                                      stride_B,
                                      {stride_D},
                                      stride_E,
                                      batch_stride_A,
                                      batch_stride_B,
                                      {batch_stride_D},
                                      batch_stride_E,
                                      a_element_op,
                                      b_element_op,
                                      cde_element_op);

    if(!gemm.IsSupportedArgument(argument))
    {
        throw std::runtime_error("wrong! device_gemm with the specified compilation parameters does "
                                 "not support this GEMM problem");
    }

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    std::size_t flop = std::size_t(2) * batch_count * M * N * K;

    std::size_t num_btype = sizeof(ADataType) * batch_count * M * K +
                            sizeof(BDataType) * batch_count * K * N +
                            sizeof(EDataType) * batch_count * M * N;

    float tflops = static_cast<float>(flop) / 1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
c_device_buf
.
FromDevice
(
e_g_m_n_device_result
.
mData
.
data
());
using
ReferenceBatchedGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceBatchedGemm
<
ADataType
,
BDataType
,
EDataType
,
AElementOp
,
BElementOp
,
PassThrough
>
;
auto
ref_batched_gemm
=
ReferenceBatchedGemmInstance
{};
auto
ref_invoker
=
ref_batched_gemm
.
MakeInvoker
();
Tensor
<
EDataType
>
e_g_m_n_host_result
(
f_host_tensor_descriptor
(
batch_count
,
M
,
N
,
stride_E
,
batch_stride_E
,
ELayout
{}));
auto
ref_argument
=
ref_batched_gemm
.
MakeArgument
(
a_g_m_k
,
b_g_k_n
,
e_g_m_n_host_result
,
a_element_op
,
b_element_op
,
PassThrough
{});
ref_invoker
.
Run
(
ref_argument
);
for
(
int
g
=
0
;
g
<
batch_count
;
g
++
)
{
for
(
int
m
=
0
;
m
<
M
;
++
m
)
{
for
(
int
n
=
0
;
n
<
N
;
++
n
)
{
cde_element_op
(
e_g_m_n_host_result
(
g
,
m
,
n
),
e_g_m_n_host_result
(
g
,
m
,
n
),
d_g_m_n
(
g
,
m
,
n
));
}
}
}
pass
=
ck
::
utils
::
check_err
(
e_g_m_n_host_result
.
mData
,
e_g_m_n_device_result
.
mData
,
"Error: Incorrect results c"
);
}
return
pass
?
0
:
1
;
}
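The D tensor in this example deserves a second look: with stride_D = 0 and batch_stride_D = N, the G x M x N descriptor actually addresses a G x N bias that is broadcast across the M dimension, so the fused epilogue computes E[g,m,n] = sum_k A[g,m,k] * B[g,k,n] + D[g,n]. Below is a minimal host-side sketch of that semantics, written against plain std::vector buffers rather than the CK host-tensor utilities; it is illustrative only, assuming the same layouts as the example (A row-major, B column-major per batch, densely packed batches).

#include <cstddef>
#include <vector>

// Illustrative reference: the same math the example's verification loop
// performs. E[g,m,n] = sum_k A[g,m,k] * B[g,k,n] + D[g,n] (D broadcast along M).
void reference_batched_gemm_bias(const std::vector<float>& a, // [G, M, K], row-major
                                 const std::vector<float>& b, // [G, K, N], column-major per batch
                                 const std::vector<float>& d, // [G, N], bias broadcast over M
                                 std::vector<float>& e,       // [G, M, N], row-major
                                 std::size_t G, std::size_t M, std::size_t N, std::size_t K)
{
    for(std::size_t g = 0; g < G; ++g)
        for(std::size_t m = 0; m < M; ++m)
            for(std::size_t n = 0; n < N; ++n)
            {
                float acc = 0.f;
                for(std::size_t k = 0; k < K; ++k)
                    acc += a[(g * M + m) * K + k] * b[(g * N + n) * K + k];
                e[(g * M + m) * N + n] = acc + d[g * N + n]; // the Add epilogue
            }
}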
example/29_batched_gemm_multi_d/batched_gemm_xdl_fp16.cpp
0 → 100644
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using F16 = ck::half_t;
using F32 = float;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

using ADataType        = F16;
using BDataType        = F16;
using AccDataType      = F32;
using CShuffleDataType = F16;
using DsDataType       = ck::Tuple<>;
using EDataType        = F16;

using ALayout  = Row;
using BLayout  = Col;
using DsLayout = ck::Tuple<>;
using ELayout  = Row;

using AElementOp   = PassThrough;
using BElementOp   = PassThrough;
using CDEElementOp = PassThrough;

static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on

using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
    ReferenceBatchedGemm<ADataType, BDataType, EDataType, AElementOp, BElementOp, CDEElementOp>;

int main(int argc, char* argv[])
{
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;

    const int M = 256 * (rand() % 16 + 1);
    const int N = 128 * (rand() % 16 + 1);
    const int K = 64 * (rand() % 16 + 1);

    const int stride_A = K;
    const int stride_B = K;
    const int stride_C = N;

    const int batch_stride_A = M * K;
    const int batch_stride_B = K * N;
    const int batch_stride_C = M * N;

    const int batch_count = 16;

    if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
        exit(0);
    }

    // GEMM shape
    auto f_host_tensor_descriptor = [](std::size_t batch_count_,
                                       std::size_t row,
                                       std::size_t col,
                                       std::size_t stride,
                                       std::size_t batch_stride,
                                       auto layout) {
        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
            return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
                                        std::vector<std::size_t>({batch_stride, stride, 1}));
        }
        else
        {
            return HostTensorDescriptor(std::vector<std::size_t>({batch_count_, row, col}),
                                        std::vector<std::size_t>({batch_stride, 1, stride}));
        }
    };

    Tensor<ADataType> a_g_m_k(
        f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{}));
    Tensor<BDataType> b_g_k_n(
        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
    Tensor<EDataType> e_g_m_n_device_result(
        f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{}));

    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
    std::cout << "e_g_m_n: " << e_g_m_n_device_result.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
        break;
    default:
        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
        break;
    }

    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
    DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a_g_m_k.mData.data());
    b_device_buf.ToDevice(b_g_k_n.mData.data());

    auto a_element_op   = AElementOp{};
    auto b_element_op   = BElementOp{};
    auto cde_element_op = CDEElementOp{};

    auto gemm    = DeviceGemmInstance{};
    auto invoker = gemm.MakeInvoker();

    // do GEMM
    auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
                                      b_device_buf.GetDeviceBuffer(),
                                      {},
                                      c_device_buf.GetDeviceBuffer(),
                                      M,
                                      N,
                                      K,
                                      batch_count,
                                      stride_A,
                                      stride_B,
                                      {},
                                      stride_C,
                                      batch_stride_A,
                                      batch_stride_B,
                                      {},
                                      batch_stride_C,
                                      a_element_op,
                                      b_element_op,
                                      cde_element_op);

    if(!gemm.IsSupportedArgument(argument))
    {
        throw std::runtime_error("wrong! device_gemm with the specified compilation parameters does "
                                 "not support this GEMM problem");
    }

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    std::size_t flop = std::size_t(2) * batch_count * M * N * K;

    std::size_t num_btype = sizeof(ADataType) * batch_count * M * K +
                            sizeof(BDataType) * batch_count * K * N +
                            sizeof(EDataType) * batch_count * M * N;

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << gemm.GetTypeString() << std::endl;

    bool pass = true;
    if(do_verification)
    {
        c_device_buf.FromDevice(e_g_m_n_device_result.mData.data());

        auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
        auto ref_invoker      = ref_batched_gemm.MakeInvoker();

        Tensor<EDataType> e_g_m_n_host_result(
            f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{}));

        auto ref_argument = ref_batched_gemm.MakeArgument(
            a_g_m_k, b_g_k_n, e_g_m_n_host_result, a_element_op, b_element_op, cde_element_op);

        ref_invoker.Run(ref_argument);

        pass = ck::utils::check_err(
            e_g_m_n_host_result.mData, e_g_m_n_device_result.mData, "Error: Incorrect results c");
    }

    return pass ? 0 : 1;
}
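Both examples accept the same three positional arguments (for instance, verification on, integer initialization, timing on), and their printed metrics follow directly from the problem size: flop = 2 * batch_count * M * N * K, and num_btype counts one read of A and B plus one write of E. As a worked example at the smallest randomized size above (batch_count = 16, M = 256, N = 128, K = 64), flop is 2 * 16 * 256 * 128 * 64, roughly 6.7e7 FLOP, so a hypothetical ave_time of 0.01 ms would report 6.7e7 / 1e9 / 0.01, about 6.7 TFlops; gb_per_sec likewise divides bytes by 1e6 and by milliseconds, which reduces to gigabytes per second.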
example/28_group_convnd_fwd_bias_relu/CMakeLists.txt → example/30_grouped_convnd_fwd_bias_relu/CMakeLists.txt
File moved
example/28_group_convnd_fwd_bias_relu/README.md → example/30_grouped_convnd_fwd_bias_relu/README.md
File moved
example/28_group_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_common.hpp → example/30_grouped_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_common.hpp
File moved
example/28_group_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_relu_xdl_fp16.cpp → example/30_grouped_convnd_fwd_bias_relu/grouped_convnd_fwd_bias_relu_xdl_fp16.cpp
File moved
example/CMakeLists.txt
@@ -42,4 +42,6 @@ add_subdirectory(24_batched_gemm_e_permute)
 add_subdirectory(25_gemm_bias_e_permute)
 add_subdirectory(26_contraction)
 add_subdirectory(27_layernorm)
-add_subdirectory(28_group_convnd_fwd_bias_relu)
+add_subdirectory(28_grouped_gemm_bias)
+add_subdirectory(29_batched_gemm_multi_d)
+add_subdirectory(30_grouped_convnd_fwd_bias_relu)
include/ck/tensor_operation/gpu/device/device_batched_gemm_c_permute_xdl.hpp
0 → 100644
This diff is collapsed.
include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp
@@ -14,7 +14,13 @@ struct BatchedGemmEPermuteDesc
     ck::index_t stride_G0_, stride_G1_, stride_M_, stride_N_;
 };
 
-template <typename AElementwiseOperation,
+template <typename ALayout,
+          typename BLayout,
+          typename DELayout,
+          typename ADataType,
+          typename BDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CDEElementwiseOperation>
 struct DeviceBatchedGemmEPermute : public BaseOperator
@@ -28,11 +34,13 @@ struct DeviceBatchedGemmEPermute : public BaseOperator
                         index_t K,
                         index_t stride_A,
                         index_t stride_B,
+                        index_t batch_stride_A,
+                        index_t batch_stride_B,
                         BatchedGemmEPermuteDesc batched_gemm_e_permute_desc,
-                        index_t BatchCount,
                         AElementwiseOperation a_element_op,
                         BElementwiseOperation b_element_op,
-                        CDEElementwiseOperation cde_element_op) = 0;
+                        CDEElementwiseOperation cde_element_op,
+                        ck::index_t BatchCount) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
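Taken together, the two hunks above change the DeviceBatchedGemmEPermute contract in two ways: the interface is now templated on the layouts and data types (ALayout, BLayout, DELayout, ADataType, BDataType, EDataType) rather than only on the elementwise operations, and MakeArgumentPointer gains explicit batch_stride_A/batch_stride_B arguments while BatchCount moves to the end of the parameter list. Any caller of the old signature must be updated to the new argument order; the matching changes to the Xdl implementation follow.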
include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute_xdl.hpp
@@ -118,6 +118,7 @@ __global__ void
 template <typename ALayout,
           typename BLayout,
+          typename ELayout,
           typename ADataType,
           typename BDataType,
           typename AccDataType,
@@ -157,7 +158,13 @@ template <typename ALayout,
           typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CDEBlockTransferScalarPerVector_NPerBlock,
           LoopScheduler LoopSched = make_default_loop_scheduler()>
-struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementwiseOperation,
+struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<ALayout,
+                                                                       BLayout,
+                                                                       ELayout,
+                                                                       ADataType,
+                                                                       BDataType,
+                                                                       EDataType,
+                                                                       AElementwiseOperation,
                                                                        BElementwiseOperation,
                                                                        CDEElementwiseOperation>
 {
@@ -389,11 +396,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw
                  index_t K,
                  index_t stride_A,
                  index_t stride_B,
+                 index_t batch_stride_A,
+                 index_t batch_stride_B,
                  BatchedGemmEPermuteDesc batched_gemm_e_permute_desc,
-                 index_t BatchCount,
                  AElementwiseOperation a_element_op,
                  BElementwiseOperation b_element_op,
-                 CDEElementwiseOperation cde_element_op)
+                 CDEElementwiseOperation cde_element_op,
+                 index_t BatchCount)
             : p_a_grid_{p_a_grid},
               p_b_grid_{p_b_grid},
               p_e_grid_{p_e_grid},
@@ -419,10 +428,7 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw
                   batched_gemm_e_permute_desc.stride_G1_,
                   batched_gemm_e_permute_desc.stride_M_,
                   batched_gemm_e_permute_desc.stride_N_)},
-              compute_ptr_offset_of_batch_{
-                  type_convert<index_t>(a_grid_desc_ak0_m_ak1_.GetElementSpaceSize()),
-                  type_convert<index_t>(b_grid_desc_bk0_n_bk1_.GetElementSpaceSize()),
-                  e_grid_desc_g0_g1_m_n_},
+              compute_ptr_offset_of_batch_{batch_stride_A, batch_stride_B, e_grid_desc_g0_g1_m_n_},
               block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
@@ -584,11 +590,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw
                             index_t K,
                             index_t stride_A,
                             index_t stride_B,
+                            index_t batch_stride_A,
+                            index_t batch_stride_B,
                             BatchedGemmEPermuteDesc batched_gemm_e_permute_desc,
-                            index_t BatchCount,
                             AElementwiseOperation a_element_op,
                             BElementwiseOperation b_element_op,
-                            CDEElementwiseOperation cde_element_op)
+                            CDEElementwiseOperation cde_element_op,
+                            index_t BatchCount)
     {
         return Argument{p_a,
                         p_b,
@@ -598,11 +606,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw
                         K,
                         stride_A,
                         stride_B,
+                        batch_stride_A,
+                        batch_stride_B,
                         batched_gemm_e_permute_desc,
-                        BatchCount,
                         a_element_op,
                         b_element_op,
-                        cde_element_op};
+                        cde_element_op,
+                        BatchCount};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
@@ -617,11 +627,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw
                         index_t K,
                         index_t stride_A,
                         index_t stride_B,
+                        index_t batch_stride_A,
+                        index_t batch_stride_B,
                         BatchedGemmEPermuteDesc batched_gemm_e_permute_desc,
-                        index_t BatchCount,
                         AElementwiseOperation a_element_op,
                         BElementwiseOperation b_element_op,
-                        CDEElementwiseOperation cde_element_op) override
+                        CDEElementwiseOperation cde_element_op,
+                        index_t BatchCount) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
@@ -631,11 +643,13 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute<AElementw
                                           K,
                                           stride_A,
                                           stride_B,
+                                          batch_stride_A,
+                                          batch_stride_B,
                                           batched_gemm_e_permute_desc,
-                                          BatchCount,
                                           a_element_op,
                                           b_element_op,
-                                          cde_element_op);
+                                          cde_element_op,
+                                          BatchCount);
     }
 
     // polymorphic
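The compute_ptr_offset_of_batch_ change above is the substantive part of this diff: instead of deriving per-batch offsets from the element-space sizes of the A/B grid descriptors (which assumes densely packed batches), the functor is now fed the user-supplied batch strides. A minimal sketch of what such a strided-batch offset functor typically looks like follows; the struct, member, and method names here are illustrative placeholders, not the exact CK definitions.

#include <cstdint>

using index_t = std::int32_t; // stand-in for ck::index_t

// Illustrative sketch: per-batch pointer offsets computed from explicit batch
// strides, which permits batches that are not densely packed (e.g. broadcast
// or strided inputs).
struct ComputePtrOffsetOfStridedBatchSketch
{
    index_t batch_stride_a_;
    index_t batch_stride_b_;

    // Element offset of batch g relative to each tensor's base pointer.
    constexpr index_t GetAPtrOffset(index_t g) const { return g * batch_stride_a_; }
    constexpr index_t GetBPtrOffset(index_t g) const { return g * batch_stride_b_; }
};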
include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <vector>

#include "device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename ALayout,
          typename BLayout,
          typename DsLayout,
          typename ELayout,
          typename ADataType,
          typename BDataType,
          typename DsDataType,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CDEElementwiseOperation>
struct DeviceBatchedGemmMultiD : public BaseOperator
{
    static constexpr index_t NumDTensor = DsDataType::Size();

    static_assert(DsLayout::Size() == DsDataType::Size(), "wrong! inconsistent NumDTensor");

    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_a,
                        const void* p_b,
                        const std::array<const void*, NumDTensor>& p_ds,
                        void* p_e,
                        index_t M,
                        index_t N,
                        index_t K,
                        index_t Batch,
                        index_t StrideA,
                        index_t StrideB,
                        const std::array<ck::index_t, NumDTensor>& StrideDs,
                        index_t StrideE,
                        index_t BatchStrideA,
                        index_t BatchStrideB,
                        const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
                        index_t BatchStrideE,
                        AElementwiseOperation a_element_op,
                        BElementwiseOperation b_element_op,
                        CDEElementwiseOperation cde_element_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
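Because DeviceBatchedGemmMultiD exposes only void-pointer MakeArgumentPointer/MakeInvokerPointer entry points, it can serve as a type-erased handle over any concrete instance (such as the DeviceBatchedGemmMultiD_Xdl introduced in this commit). Below is a hedged sketch of the call pattern, not code from this commit: the helper name and its parameters are invented for illustration, and it assumes the CK headers providing ck::index_t, StreamConfig, and the interface above are already included.

#include <array>
#include <memory>

// Illustrative only: drive a concrete DeviceBatchedGemmMultiD specialization
// with one D tensor through its type-erased interface. All pointers refer to
// device buffers the caller has already allocated and filled.
template <typename DeviceOp, typename AOp, typename BOp, typename CDEOp>
float run_batched_gemm_multi_d(DeviceOp& op,
                               const void* p_a, const void* p_b, const void* p_d0, void* p_e,
                               ck::index_t M, ck::index_t N, ck::index_t K, ck::index_t batch,
                               ck::index_t stride_A, ck::index_t stride_B,
                               ck::index_t stride_D0, ck::index_t stride_E,
                               ck::index_t batch_stride_A, ck::index_t batch_stride_B,
                               ck::index_t batch_stride_D0, ck::index_t batch_stride_E,
                               AOp a_op, BOp b_op, CDEOp cde_op)
{
    auto arg_ptr = op.MakeArgumentPointer(p_a, p_b, {p_d0}, p_e,
                                          M, N, K, batch,
                                          stride_A, stride_B, {stride_D0}, stride_E,
                                          batch_stride_A, batch_stride_B,
                                          {batch_stride_D0}, batch_stride_E,
                                          a_op, b_op, cde_op);

    auto invoker_ptr = op.MakeInvokerPointer();

    // StreamConfig{nullptr, true} requests kernel timing, as in the examples.
    return invoker_ptr->Run(arg_ptr.get(), StreamConfig{nullptr, true});
}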
include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp
0 → 100644
This diff is collapsed.
include/ck/tensor_operation/gpu/device/device_gemm.hpp
@@ -12,12 +12,6 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 
-struct GemmShape
-{
-    ck::index_t M, N, K;
-    ck::index_t StrideA, StrideB, StrideC;
-};
-
 template <typename ALayout,
           typename BLayout,
           typename CLayout,
@@ -46,29 +40,6 @@ struct DeviceGemm : public BaseOperator
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
 
-template <typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation>
-struct DeviceGroupedGemm : public BaseOperator
-{
-    virtual std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(std::vector<const void*>& p_a,
-                        std::vector<const void*>& p_b,
-                        std::vector<void*>& p_c,
-                        std::vector<GemmShape>& gemm_shapes,
-                        AElementwiseOperation a_element_op,
-                        BElementwiseOperation b_element_op,
-                        CElementwiseOperation c_element_op,
-                        ck::index_t KBatch = 1) = 0;
-
-    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
-};
-
-template <typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation>
-using DeviceGroupedGemmPtr = std::unique_ptr<
-    DeviceGroupedGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
-
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp
@@ -349,13 +349,13 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                                             e_grid_desc_m_n_,
                                             block_2_etile_map_))
         {
-            e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                    e_grid_desc_m_n_);
-
             ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
                 GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                     ds_grid_desc_m_n_);
+
+            e_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    e_grid_desc_m_n_);
         }
     }
@@ -411,8 +411,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                                             arg.e_grid_desc_m_n_,
                                             arg.block_2_etile_map_))
         {
-            throw std::runtime_error(
-                "wrong! GridwiseGemm has invalid setting");
+            throw std::runtime_error(
+                "wrong! GridwiseGemmMultipleD_xdl_cshuffle has invalid setting");
         }
 
         const index_t grid_size =