Commit 86f8ac01 authored by Jakub Piasecki's avatar Jakub Piasecki
Browse files

Merge remote-tracking branch 'origin/develop' into jakpias/pool1d_fwd

parents 3f6360d0 ab60b390
...@@ -189,7 +189,9 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090) ...@@ -189,7 +189,9 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
message("Adding the enable-post-misched=0 compiler flag") message("Adding the enable-post-misched=0 compiler flag")
add_compile_options("SHELL: -mllvm -enable-post-misched=0") add_compile_options("SHELL: -mllvm -enable-post-misched=0")
endif() endif()
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000) set(check-coerce)
check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" check-coerce)
if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000)
message("Adding the amdgpu-coerce-illegal-types=1") message("Adding the amdgpu-coerce-illegal-types=1")
add_compile_options("SHELL: -mllvm -amdgpu-coerce-illegal-types=1") add_compile_options("SHELL: -mllvm -amdgpu-coerce-illegal-types=1")
endif() endif()
...@@ -539,6 +541,9 @@ if(NOT DEFINED INSTANCES_ONLY) ...@@ -539,6 +541,9 @@ if(NOT DEFINED INSTANCES_ONLY)
PACKAGE_NAME examples PACKAGE_NAME examples
) )
add_subdirectory(example) add_subdirectory(example)
if(GPU_TARGETS MATCHES "gfx9" AND NOT INSTANCES_ONLY)
add_subdirectory(codegen)
endif()
if(BUILD_TESTING) if(BUILD_TESTING)
add_subdirectory(test) add_subdirectory(test)
endif() endif()
......
FROM ubuntu:20.04 FROM ubuntu:20.04
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
ARG ROCMVERSION=6.1 ARG ROCMVERSION=6.2
ARG compiler_version="" ARG compiler_version=""
ARG compiler_commit="" ARG compiler_commit=""
ARG CK_SCCACHE="" ARG CK_SCCACHE=""
...@@ -17,17 +17,12 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl ...@@ -17,17 +17,12 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
RUN if [ "$ROCMVERSION" != "6.2" ]; then \ RUN if [ "$ROCMVERSION" != "6.3" ]; then \
sh -c "wget https://repo.radeon.com/amdgpu-install/6.1/ubuntu/focal/amdgpu-install_6.1.60100-1_all.deb --no-check-certificate" && \ sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.1.60100-1_all.deb && \ apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.2.60200-1_all.deb && \
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \ sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
elif [ "$ROCMVERSION" = "6.2" ] && [ "$compiler_version" = "rc4" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.2-20.04-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.2-20.04-1_all.deb && \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.2 rel-63 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=2009461; \
fi fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
...@@ -64,6 +59,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- ...@@ -64,6 +59,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
python3-dev \ python3-dev \
python3-pip \ python3-pip \
redis \ redis \
rocm-llvm-dev \
sshpass \ sshpass \
stunnel \ stunnel \
software-properties-common \ software-properties-common \
......
...@@ -38,7 +38,7 @@ def getDockerImageName(){ ...@@ -38,7 +38,7 @@ def getDockerImageName(){
img = "${params.USE_CUSTOM_DOCKER}" img = "${params.USE_CUSTOM_DOCKER}"
} }
else{ else{
if (params.ROCMVERSION != "6.2"){ if (params.ROCMVERSION != "6.3"){
if (params.COMPILER_VERSION == "") { if (params.COMPILER_VERSION == "") {
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
} }
...@@ -204,6 +204,9 @@ def cmake_build(Map conf=[:]){ ...@@ -204,6 +204,9 @@ def cmake_build(Map conf=[:]){
cd build cd build
""" """
def invocation_tag="" def invocation_tag=""
if (setup_args.contains("gfx12")){
invocation_tag="gfx12"
}
if (setup_args.contains("gfx11")){ if (setup_args.contains("gfx11")){
invocation_tag="gfx11" invocation_tag="gfx11"
} }
...@@ -285,6 +288,19 @@ def cmake_build(Map conf=[:]){ ...@@ -285,6 +288,19 @@ def cmake_build(Map conf=[:]){
if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) { if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) {
archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
} }
if (params.RUN_CK_TILE_TESTS){
try{
archiveArtifacts "perf_fmha_fwd_*.log"
archiveArtifacts "perf_fmha_bwd_*.log"
stash name: "perf_fmha_fwd_gfx942.log"
stash name: "perf_fmha_bwd_gfx942.log"
stash name: "perf_fmha_fwd_gfx90a.log"
stash name: "perf_fmha_bwd_gfx90a.log"
}
catch(Exception err){
echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
}
}
} }
def buildHipClangJob(Map conf=[:]){ def buildHipClangJob(Map conf=[:]){
...@@ -518,7 +534,7 @@ def Build_CK(Map conf=[:]){ ...@@ -518,7 +534,7 @@ def Build_CK(Map conf=[:]){
//check whether to run performance tests on this node //check whether to run performance tests on this node
def do_perf_tests = 0 def do_perf_tests = 0
sh 'rocminfo | tee rocminfo.log' sh 'rocminfo | tee rocminfo.log'
if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){ if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx1201" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){
do_perf_tests = 1 do_perf_tests = 1
echo "Stash profiler and run performance tests" echo "Stash profiler and run performance tests"
} }
...@@ -612,6 +628,17 @@ def process_results(Map conf=[:]){ ...@@ -612,6 +628,17 @@ def process_results(Map conf=[:]){
timeout(time: 1, unit: 'HOURS'){ timeout(time: 1, unit: 'HOURS'){
try{ try{
dir("script"){ dir("script"){
if (params.RUN_CK_TILE_TESTS){
try{
unstash "perf_fmha_fwd_gfx942.log"
unstash "perf_fmha_bwd_gfx942.log"
unstash "perf_fmha_fwd_gfx90a.log"
unstash "perf_fmha_bwd_gfx90a.log"
}
catch(Exception err){
echo "could not locate the FMHA performance logs: ${err.getMessage()}."
}
}
if (params.RUN_FULL_QA){ if (params.RUN_FULL_QA){
// unstash perf files to master // unstash perf files to master
unstash "ckprofiler_0.2.0_amd64.deb" unstash "ckprofiler_0.2.0_amd64.deb"
...@@ -652,10 +679,10 @@ def process_results(Map conf=[:]){ ...@@ -652,10 +679,10 @@ def process_results(Map conf=[:]){
} }
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.1; RUN_CK_TILE_TESTS=true CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2; RUN_CK_TILE_TESTS=true
0 21 * * * % ROCMVERSION=6.1;hipTensor_test=true 0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false
0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_CODEGEN_TESTS=false;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false''' : "" 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_CODEGEN_TESTS=false;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false''' : ""
pipeline { pipeline {
...@@ -677,8 +704,8 @@ pipeline { ...@@ -677,8 +704,8 @@ pipeline {
description: 'If you want to use a custom docker image, please specify it here (default: leave blank).') description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
string( string(
name: 'ROCMVERSION', name: 'ROCMVERSION',
defaultValue: '6.1', defaultValue: '6.2',
description: 'Specify which ROCM version to use: 6.1 (default).') description: 'Specify which ROCM version to use: 6.2 (default).')
string( string(
name: 'COMPILER_VERSION', name: 'COMPILER_VERSION',
defaultValue: '', defaultValue: '',
...@@ -719,10 +746,6 @@ pipeline { ...@@ -719,10 +746,6 @@ pipeline {
name: "RUN_PERFORMANCE_TESTS", name: "RUN_PERFORMANCE_TESTS",
defaultValue: true, defaultValue: true,
description: "Run the performance tests (default: ON)") description: "Run the performance tests (default: ON)")
booleanParam(
name: "RUN_CODEGEN_TESTS",
defaultValue: true,
description: "Run the codegen tests (default: ON)")
booleanParam( booleanParam(
name: "RUN_CK_TILE_TESTS", name: "RUN_CK_TILE_TESTS",
defaultValue: false, defaultValue: false,
...@@ -731,6 +754,11 @@ pipeline { ...@@ -731,6 +754,11 @@ pipeline {
name: "BUILD_INSTANCES_ONLY", name: "BUILD_INSTANCES_ONLY",
defaultValue: false, defaultValue: false,
description: "Test building instances for various architectures simultaneously (default: OFF)") description: "Test building instances for various architectures simultaneously (default: OFF)")
booleanParam(
name: "BUILD_GFX12",
defaultValue: false,
description: "Build CK and run tests on gfx12 (default: OFF)")
} }
environment{ environment{
dbuser = "${dbuser}" dbuser = "${dbuser}"
...@@ -809,33 +837,6 @@ pipeline { ...@@ -809,33 +837,6 @@ pipeline {
} }
} }
} }
stage("Run Codegen Tests")
{
parallel
{
stage("Run Codegen Tests on gfx90a")
{
when {
beforeAgent true
expression { params.RUN_CODEGEN_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a")}
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ cd ../codegen && rm -rf build && mkdir build && cd build && \
cmake -D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx90a" \
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j check"""
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
}
}
stage("Run CK_TILE Tests") stage("Run CK_TILE Tests")
{ {
parallel parallel
...@@ -852,8 +853,7 @@ pipeline { ...@@ -852,8 +853,7 @@ pipeline {
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \ make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
cd ../ && cd ../ &&
example/ck_tile/01_fmha/script/smoke_test_fwd.sh && \ example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
example/ck_tile/01_fmha/script/smoke_test_bwd.sh"""
} }
steps{ steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
...@@ -872,8 +872,7 @@ pipeline { ...@@ -872,8 +872,7 @@ pipeline {
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \ execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \ make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
cd ../ && cd ../ &&
example/ck_tile/01_fmha/script/smoke_test_fwd.sh && \ example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
example/ck_tile/01_fmha/script/smoke_test_bwd.sh"""
} }
steps{ steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
...@@ -1008,6 +1007,26 @@ pipeline { ...@@ -1008,6 +1007,26 @@ pipeline {
cleanWs() cleanWs()
} }
} }
stage("Build CK and run Tests on gfx1201")
{
when {
beforeAgent true
expression { params.BUILD_GFX12.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() }
}
agent{ label rocmnode("gfx1201") }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1201" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-DGPU_TARGETS="gfx1201" \
-DCMAKE_CXX_COMPILER="${build_compiler()}" \
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
cleanWs()
}
}
} }
} }
......
cmake_minimum_required(VERSION 3.16)
project(composable_kernel_host LANGUAGES CXX HIP)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
...@@ -8,17 +5,9 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) ...@@ -8,17 +5,9 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
find_package(ROCM)
include(ROCMInstallTargets)
include(ROCMTest)
add_compile_options(-std=c++17) add_compile_options(-std=c++17)
find_package(hip) find_package(hip)
## HIP add_custom_target(codegen)
set(CMAKE_HIP_PLATFORM amd)
set(CMAKE_HIP_COMPILER ${CMAKE_CXX_COMPILER})
set(CMAKE_HIP_EXTENSIONS ON)
message("CMAKE_HIP_COMPILER: ${CMAKE_HIP_COMPILER}")
# add include directories # add include directories
include_directories(BEFORE include_directories(BEFORE
...@@ -32,8 +21,9 @@ list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake) ...@@ -32,8 +21,9 @@ list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake)
include(Embed) include(Embed)
file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
${CK_ROOT}/include/ck/*.hpp) ${CK_ROOT}/include/ck/*.hpp)
message(STATUS "KERNEL_FILES: ${KERNEL_FILES}") #printouts fot debug purposes
message(STATUS "RELATIVE: ${CK_ROOT}/include") #message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
#message(STATUS "RELATIVE: ${CK_ROOT}/include")
add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include) add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)
file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp) file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
......
...@@ -76,8 +76,11 @@ std::string SequenceStr(const std::vector<int>& v); ...@@ -76,8 +76,11 @@ std::string SequenceStr(const std::vector<int>& v);
std::string MakeTuple(const std::vector<std::string>& v); std::string MakeTuple(const std::vector<std::string>& v);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wglobal-constructors"
template <int... xs> template <int... xs>
const std::string S = SequenceStr({xs...}); const std::string S = SequenceStr({xs...});
#pragma clang diagnostic pop
constexpr const char* PassThrough = "ck::tensor_operation::element_wise::PassThrough"; constexpr const char* PassThrough = "ck::tensor_operation::element_wise::PassThrough";
constexpr const char* Bilinear = "ck::tensor_operation::element_wise::Bilinear"; constexpr const char* Bilinear = "ck::tensor_operation::element_wise::Bilinear";
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include "ck/host/device_gemm_multiple_d/operation.hpp" #include "ck/host/device_gemm_multiple_d/operation.hpp"
#include "ck/host/stringutils.hpp" #include "ck/host/stringutils.hpp"
#include "ck/host/types.hpp"
#include "ck/host/utils.hpp" #include "ck/host/utils.hpp"
#include <cassert> #include <cassert>
...@@ -32,11 +33,11 @@ static std::string GetGemmSpec(const std::size_t m, ...@@ -32,11 +33,11 @@ static std::string GetGemmSpec(const std::size_t m,
} }
// function to update prologue/epilogue with user provided operation // function to update prologue/epilogue with user provided operation
void Operation_Xdl_CShuffle::update_prologue(const std::string& prologue) void Operation_Xdl_CShuffle::update_prologue(const std::string& pro)
{ {
if(!prologue.empty()) if(!pro.empty())
{ {
this->prologue = prologue; this->prologue = pro;
this->cde_elem_op = "CDEElementOp"; this->cde_elem_op = "CDEElementOp";
} }
else else
...@@ -45,11 +46,11 @@ void Operation_Xdl_CShuffle::update_prologue(const std::string& prologue) ...@@ -45,11 +46,11 @@ void Operation_Xdl_CShuffle::update_prologue(const std::string& prologue)
} }
} }
void Operation_Xdl_CShuffle::update_epilogue(const std::string& epilogue) void Operation_Xdl_CShuffle::update_epilogue(const std::string& epi)
{ {
if(!epilogue.empty()) if(!epi.empty())
{ {
this->epilogue = epilogue; this->epilogue = epi;
this->cde_elem_op = "CDEElementOp"; this->cde_elem_op = "CDEElementOp";
} }
else else
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" #include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
#include <iostream> #include <iostream>
#include "ck/host/stringutils.hpp" #include "ck/host/stringutils.hpp"
#include "ck/host/types.hpp"
#include "ck/host/utils.hpp" #include "ck/host/utils.hpp"
#include <cassert> #include <cassert>
...@@ -11,34 +12,15 @@ namespace ck { ...@@ -11,34 +12,15 @@ namespace ck {
namespace host { namespace host {
namespace conv { namespace conv {
// calculate appropriate Gemm Specification based on input tensor dimensions // NOTE: in CK, MNKPadding is always used for forward convolution, so didn't
// NOTE: in CK, MNKPadding is always used for forward convolution // add GemmSpec function here
static std::string GetGemmSpec(const std::size_t m,
const std::size_t n,
const std::size_t k,
const std::size_t m_per_block,
const std::size_t n_per_block,
const std::size_t k_per_block)
{
std::string spec = "";
if(integer_divide_ceil(m, m_per_block) * m_per_block - m != 0)
spec += "M";
if(integer_divide_ceil(n, n_per_block) * n_per_block - n != 0)
spec += "N";
if(integer_divide_ceil(k, k_per_block) * k_per_block - k != 0)
spec += "K";
if(spec == "")
return "ck::tensor_operation::device::GemmSpecialization::Default";
return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding";
}
// function to update prologue/epilogue with user provided operation // function to update prologue/epilogue with user provided operation
void Operation_Conv_Fwd_Xdl_Cshuffle::update_prologue(const std::string& prologue) void Operation_Conv_Fwd_Xdl_Cshuffle::update_prologue(const std::string& pro)
{ {
if(!prologue.empty()) if(!pro.empty())
{ {
this->prologue = prologue; this->prologue = pro;
this->cde_elem_op = "CDEElementOp"; this->cde_elem_op = "CDEElementOp";
} }
else else
...@@ -47,11 +29,11 @@ void Operation_Conv_Fwd_Xdl_Cshuffle::update_prologue(const std::string& prologu ...@@ -47,11 +29,11 @@ void Operation_Conv_Fwd_Xdl_Cshuffle::update_prologue(const std::string& prologu
} }
} }
void Operation_Conv_Fwd_Xdl_Cshuffle::update_epilogue(const std::string& epilogue) void Operation_Conv_Fwd_Xdl_Cshuffle::update_epilogue(const std::string& epi)
{ {
if(!epilogue.empty()) if(!epi.empty())
{ {
this->epilogue = epilogue; this->epilogue = epi;
this->cde_elem_op = "CDEElementOp"; this->cde_elem_op = "CDEElementOp";
} }
else else
......
...@@ -4,7 +4,10 @@ ...@@ -4,7 +4,10 @@
namespace ck { namespace ck {
namespace host { namespace host {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wglobal-constructors"
const std::string config_header = ""; const std::string config_header = "";
#pragma clang diagnostic pop
std::unordered_map<std::string_view, std::string_view> GetHeaders() std::unordered_map<std::string_view, std::string_view> GetHeaders()
{ {
......
...@@ -4,7 +4,9 @@ file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp) ...@@ -4,7 +4,9 @@ file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
foreach(TEST_SRC ${TEST_SRCS}) foreach(TEST_SRC ${TEST_SRCS})
set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP) set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE) get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
rocm_add_test_executable(test_host_${BASE_NAME} ${TEST_SRC}) add_executable(test_host_${BASE_NAME} ${TEST_SRC})
add_dependencies(codegen test_host_${BASE_NAME})
add_test(NAME codegen_test_${BASE_NAME} COMMAND test_host_${BASE_NAME})
target_link_libraries(test_host_${BASE_NAME} ck_rtc ck_host) target_link_libraries(test_host_${BASE_NAME} ck_rtc ck_host)
# target_link_libraries(test_host_${BASE_NAME} ${CK_ROOT}/build/lib/libutility.a) # target_link_libraries(test_host_${BASE_NAME} ${CK_ROOT}/build/lib/libutility.a)
target_include_directories(test_host_${BASE_NAME} PUBLIC include()) target_include_directories(test_host_${BASE_NAME} PUBLIC include())
......
...@@ -92,7 +92,6 @@ struct Epilogue ...@@ -92,7 +92,6 @@ struct Epilogue
static_cast<int>(prob.C), static_cast<int>(prob.C),
static_cast<int>(prob.Y), static_cast<int>(prob.Y),
static_cast<int>(prob.X)}; static_cast<int>(prob.X)};
ck::Array<ck::index_t, 5> d_lengths = {};
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C), ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C), static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
...@@ -109,7 +108,6 @@ struct Epilogue ...@@ -109,7 +108,6 @@ struct Epilogue
1, 1,
static_cast<int>(prob.X * prob.C), static_cast<int>(prob.X * prob.C),
static_cast<int>(prob.C)}; static_cast<int>(prob.C)};
ck::Array<ck::index_t, 5> d_strides = {};
ck::Array<ck::index_t, 2> conv_filter_strides = {2, 2}; ck::Array<ck::index_t, 2> conv_filter_strides = {2, 2};
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1}; ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
......
...@@ -92,7 +92,6 @@ struct Epilogue ...@@ -92,7 +92,6 @@ struct Epilogue
static_cast<int>(prob.C), static_cast<int>(prob.C),
static_cast<int>(prob.Y), static_cast<int>(prob.Y),
static_cast<int>(prob.X)}; static_cast<int>(prob.X)};
ck::Array<ck::index_t, 5> d_lengths = {};
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C), ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C), static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
...@@ -109,7 +108,6 @@ struct Epilogue ...@@ -109,7 +108,6 @@ struct Epilogue
1, 1,
static_cast<int>(prob.X * prob.C), static_cast<int>(prob.X * prob.C),
static_cast<int>(prob.C)}; static_cast<int>(prob.C)};
ck::Array<ck::index_t, 5> d_strides = {};
ck::Array<ck::index_t, 2> conv_filter_strides = {1, 1}; ck::Array<ck::index_t, 2> conv_filter_strides = {1, 1};
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1}; ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
......
...@@ -92,7 +92,6 @@ struct Epilogue ...@@ -92,7 +92,6 @@ struct Epilogue
static_cast<int>(prob.C), static_cast<int>(prob.C),
static_cast<int>(prob.Y), static_cast<int>(prob.Y),
static_cast<int>(prob.X)}; static_cast<int>(prob.X)};
ck::Array<ck::index_t, 5> d_lengths = {};
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C), ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C), static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
...@@ -109,7 +108,6 @@ struct Epilogue ...@@ -109,7 +108,6 @@ struct Epilogue
1, 1,
static_cast<int>(prob.X * prob.C), static_cast<int>(prob.X * prob.C),
static_cast<int>(prob.C)}; static_cast<int>(prob.C)};
ck::Array<ck::index_t, 5> d_strides = {};
ck::Array<ck::index_t, 2> conv_filter_strides = {2, 2}; ck::Array<ck::index_t, 2> conv_filter_strides = {2, 2};
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1}; ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
......
...@@ -92,7 +92,6 @@ struct Epilogue ...@@ -92,7 +92,6 @@ struct Epilogue
static_cast<int>(prob.C), static_cast<int>(prob.C),
static_cast<int>(prob.Y), static_cast<int>(prob.Y),
static_cast<int>(prob.X)}; static_cast<int>(prob.X)};
ck::Array<ck::index_t, 5> d_lengths = {};
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C), ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C), static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
...@@ -109,7 +108,6 @@ struct Epilogue ...@@ -109,7 +108,6 @@ struct Epilogue
1, 1,
static_cast<int>(prob.X * prob.C), static_cast<int>(prob.X * prob.C),
static_cast<int>(prob.C)}; static_cast<int>(prob.C)};
ck::Array<ck::index_t, 5> d_strides = {};
ck::Array<ck::index_t, 2> conv_filter_strides = {1, 1}; ck::Array<ck::index_t, 2> conv_filter_strides = {1, 1};
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1}; ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
......
...@@ -118,4 +118,4 @@ void kernel::launch(hipStream_t stream, ...@@ -118,4 +118,4 @@ void kernel::launch(hipStream_t stream,
launch_kernel(impl->fun, stream, global, local, kernargs.data(), size); launch_kernel(impl->fun, stream, global, local, kernargs.data(), size);
} }
} // namespace rtc } // namespace rtc
\ No newline at end of file
...@@ -45,4 +45,4 @@ void tmp_dir::execute(const std::string& cmd) const ...@@ -45,4 +45,4 @@ void tmp_dir::execute(const std::string& cmd) const
tmp_dir::~tmp_dir() { std::filesystem::remove_all(this->path); } tmp_dir::~tmp_dir() { std::filesystem::remove_all(this->path); }
} // namespace rtc } // namespace rtc
\ No newline at end of file
rocm-docs-core==1.6.1 rocm-docs-core==1.6.2
sphinxcontrib-bibtex==2.6.2 sphinxcontrib-bibtex==2.6.2
...@@ -103,7 +103,7 @@ requests==2.32.3 ...@@ -103,7 +103,7 @@ requests==2.32.3
# via # via
# pygithub # pygithub
# sphinx # sphinx
rocm-docs-core==1.6.1 rocm-docs-core==1.6.2
# via -r requirements.in # via -r requirements.in
six==1.16.0 six==1.16.0
# via pybtex # via pybtex
......
#!/bin/bash
#
# in order to run this script you'd first need to build the tile_example_fmha_fwd and tile_eaxmple_fmha_bwd executables in ../build/bin/
#
# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>
# input arguments:
# environment tag : a string describing the specifics of your test environment
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
# host name : $hostname
# gpu architecture: e.g., gfx90a, or gfx942, etc.
#get the command line arguments:
export env_type=$1
echo 'Environment type: ' $env_type
export branch=$2
echo 'Branch name: ' $branch
export host_name=$3
echo 'Host name: ' $host_name
export GPU_arch=$4
echo 'GPU_arch: ' $GPU_arch
function print_log_header(){
rm -f $1;
echo 'On branch ' $3 &> $1;
echo 'Node name: ' $4 >> $1;
#get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
rocminfo | grep "Compute Unit:" >> $1;
hipcc --version | grep -e 'HIP version' >> $1;
echo 'Environment type: ' $2 >> $1;
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
}
#run verification tests
example/ck_tile/01_fmha/script/smoke_test_fwd.sh
example/ck_tile/01_fmha/script/smoke_test_bwd.sh
#run performance benchmarks
export fmha_fwd_log="perf_fmha_fwd_$GPU_arch.log"
print_log_header $fmha_fwd_log $env_type $branch $host_name
example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log
export fmha_bwd_log="perf_fmha_bwd_$GPU_arch.log"
print_log_header $fmha_bwd_log $env_type $branch $host_name
example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log
...@@ -102,9 +102,10 @@ __global__ void ...@@ -102,9 +102,10 @@ __global__ void
// offset base pointer for each work-group // offset base pointer for each work-group
const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y); const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z); const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
const long_index_t e_group_offset =
const long_index_t e_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx)); amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx));
const auto& ds_group_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx); const auto& ds_batch_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx);
const long_index_t e_n_offset = const long_index_t e_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx)); amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
...@@ -117,14 +118,14 @@ __global__ void ...@@ -117,14 +118,14 @@ __global__ void
DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size(); DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
static_for<0, NumDTensor, 1>{}( static_for<0, NumDTensor, 1>{}(
[&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_group_offset[i]; }); [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
if constexpr(isMultiA || isMultiB) if constexpr(isMultiA || isMultiB)
{ {
AsPointer p_as_grid_grp; AsPointer p_as_grid_grp;
BsPointer p_bs_grid_grp; BsPointer p_bs_grid_grp;
const auto& as_group_offset = compute_ptr_offset_of_groups.GetAsPtrOffset(g_idx); const auto& as_batch_offset = compute_ptr_offset_of_groups.GetAsPtrOffset(g_idx);
// compute_ptr_offset_of_n_ not need BatchStrideB so // compute_ptr_offset_of_n_ not need BatchStrideB so
// in case of MultiA is false but isMultiB is true // in case of MultiA is false but isMultiB is true
...@@ -135,27 +136,27 @@ __global__ void ...@@ -135,27 +136,27 @@ __global__ void
static constexpr index_t NumATensor = AGridDesc_AK0_M_AK1::Size(); static constexpr index_t NumATensor = AGridDesc_AK0_M_AK1::Size();
static_for<0, NumATensor, 1>{}([&](auto i) { static_for<0, NumATensor, 1>{}([&](auto i) {
p_as_grid_grp(i) = p_as_grid[i] + as_group_offset[i] + as_n_offset[i]; p_as_grid_grp(i) = p_as_grid[i] + as_batch_offset[i] + as_n_offset[i];
}); });
} }
else else
{ {
const long_index_t a_n_offset = compute_ptr_offset_of_n.GetAPtrOffset(n_idx); const long_index_t a_n_offset = compute_ptr_offset_of_n.GetAPtrOffset(n_idx);
static_for<0, 1, 1>{}( static_for<0, 1, 1>{}(
[&](auto i) { p_as_grid_grp(i) = p_as_grid[i] + as_group_offset[i] + a_n_offset; }); [&](auto i) { p_as_grid_grp(i) = p_as_grid[i] + as_batch_offset[i] + a_n_offset; });
} }
const auto& bs_group_offset = compute_ptr_offset_of_groups.GetBsPtrOffset(g_idx); const auto& bs_batch_offset = compute_ptr_offset_of_groups.GetBsPtrOffset(g_idx);
static constexpr index_t NumBTensor = BGridDesc_BK0_N_BK1::Size(); static constexpr index_t NumBTensor = BGridDesc_BK0_N_BK1::Size();
static_for<0, NumBTensor, 1>{}( static_for<0, NumBTensor, 1>{}(
[&](auto i) { p_bs_grid_grp(i) = p_bs_grid[i] + bs_group_offset[i]; }); [&](auto i) { p_bs_grid_grp(i) = p_bs_grid[i] + bs_batch_offset[i]; });
GridwiseGemm::template Run<HasMainKBlockLoop>( GridwiseGemm::template Run<HasMainKBlockLoop>(
p_as_grid_grp, p_as_grid_grp,
p_bs_grid_grp, p_bs_grid_grp,
p_ds_grid_grp, p_ds_grid_grp,
p_e_grid + e_group_offset + e_n_offset, p_e_grid + e_batch_offset + e_n_offset,
p_shared, p_shared,
a_element_op, a_element_op,
b_element_op, b_element_op,
...@@ -168,19 +169,19 @@ __global__ void ...@@ -168,19 +169,19 @@ __global__ void
} }
else else
{ {
const long_index_t a_group_offset = const long_index_t a_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx)); amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
const long_index_t b_group_offset = const long_index_t b_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx)); amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
const long_index_t a_n_offset = const long_index_t a_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)); amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
GridwiseGemm::template Run<HasMainKBlockLoop>( GridwiseGemm::template Run<HasMainKBlockLoop>(
p_as_grid + a_group_offset + a_n_offset, p_as_grid + a_batch_offset + a_n_offset,
p_bs_grid + b_group_offset, p_bs_grid + b_batch_offset,
p_ds_grid_grp, p_ds_grid_grp,
p_e_grid + e_group_offset + e_n_offset, p_e_grid + e_batch_offset + e_n_offset,
p_shared, p_shared,
a_element_op, a_element_op,
b_element_op, b_element_op,
...@@ -282,8 +283,7 @@ template <index_t NDimSpatial, ...@@ -282,8 +283,7 @@ template <index_t NDimSpatial,
// in tuple for MultiAB), unpack if tuple was // in tuple for MultiAB), unpack if tuple was
// passed // passed
typename BComputeDataType = AComputeDataType, typename BComputeDataType = AComputeDataType,
LoopScheduler LoopSched = make_default_loop_scheduler(), LoopScheduler LoopSched = make_default_loop_scheduler()>
index_t NumGroupsToMerge = 1>
struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
: public DeviceGroupedConvFwdMultipleABD<NDimSpatial, : public DeviceGroupedConvFwdMultipleABD<NDimSpatial,
ALayout, ALayout,
...@@ -302,8 +302,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -302,8 +302,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
{ {
using DeviceOp = DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle; using DeviceOp = DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle;
static_assert(NumGroupsToMerge >= 1);
static constexpr bool isMultiA = is_detected<is_tuple, ADataType>::value; static constexpr bool isMultiA = is_detected<is_tuple, ADataType>::value;
static constexpr bool isMultiB = is_detected<is_tuple, BDataType>::value; static constexpr bool isMultiB = is_detected<is_tuple, BDataType>::value;
...@@ -319,9 +317,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -319,9 +317,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
using ConvToGemmFwdTransformer = TransformConvFwdToGemm<NDimSpatial, using ConvToGemmFwdTransformer = TransformConvFwdToGemm<NDimSpatial,
ConvForwardSpecialization, ConvForwardSpecialization,
true /*SplitN*/, true /*SplitN*/,
ALayout, ADataType,
ELayout, EDataType>;
NumGroupsToMerge>;
static constexpr auto matrix_padder = static constexpr auto matrix_padder =
MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock}; MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};
...@@ -520,8 +517,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -520,8 +517,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
{ {
static_for<0, NumATensor, 1>{}([&](auto i) { static_for<0, NumATensor, 1>{}([&](auto i) {
// Init compute_ptr_offset_of_groups_ for multiple AB // Init compute_ptr_offset_of_groups_ for multiple AB
compute_ptr_offset_of_groups_.BatchStrideA_(i) = compute_ptr_offset_of_groups_.BatchStrideA_(i) = a_g_n_c_wis_strides[0];
a_g_n_c_wis_strides[0] * NumGroupsToMerge;
// Use GemmADataType/GemmBDataType to iterate over tuple (even if passed data // Use GemmADataType/GemmBDataType to iterate over tuple (even if passed data
// type is not tuple) // type is not tuple)
...@@ -549,8 +545,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -549,8 +545,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
}); });
static_for<0, NumBTensor, 1>{}([&](auto i) { static_for<0, NumBTensor, 1>{}([&](auto i) {
// Init compute_ptr_offset_of_groups_ for multiple AB // Init compute_ptr_offset_of_groups_ for multiple AB
compute_ptr_offset_of_groups_.BatchStrideB_(i) = compute_ptr_offset_of_groups_.BatchStrideB_(i) = b_g_k_c_xs_strides[0];
b_g_k_c_xs_strides[0] * NumGroupsToMerge;
using DataType = remove_cvref_t<tuple_element_t<i.value, GemmBDataType>>; using DataType = remove_cvref_t<tuple_element_t<i.value, GemmBDataType>>;
// It is possible that one of the AB is a pointer and one is a tuple. // It is possible that one of the AB is a pointer and one is a tuple.
...@@ -570,10 +565,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -570,10 +565,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
} }
else else
{ {
compute_ptr_offset_of_groups_.BatchStrideA_ = compute_ptr_offset_of_groups_.BatchStrideA_ = a_g_n_c_wis_strides[0];
a_g_n_c_wis_strides[0] * NumGroupsToMerge; compute_ptr_offset_of_groups_.BatchStrideB_ = b_g_k_c_xs_strides[0];
compute_ptr_offset_of_groups_.BatchStrideB_ =
b_g_k_c_xs_strides[0] * NumGroupsToMerge;
compute_ptr_offset_of_n_.BatchStrideA_ = a_g_n_c_wis_strides[1] * conv_N_per_block_; compute_ptr_offset_of_n_.BatchStrideA_ = a_g_n_c_wis_strides[1] * conv_N_per_block_;
// p_as and p_bs are pointers // p_as and p_bs are pointers
...@@ -590,8 +583,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -590,8 +583,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
p_ds_grid_(i) = static_cast<const DDataType*>(p_ds[i]); p_ds_grid_(i) = static_cast<const DDataType*>(p_ds[i]);
// D batch stride // D batch stride
compute_ptr_offset_of_groups_.BatchStrideDs_(i) = compute_ptr_offset_of_groups_.BatchStrideDs_(i) = ds_g_n_k_wos_strides[i][0];
ds_g_n_k_wos_strides[i][0] * NumGroupsToMerge;
compute_ptr_offset_of_n_.BatchStrideDs_(i) = compute_ptr_offset_of_n_.BatchStrideDs_(i) =
ds_g_n_k_wos_strides[i][1] * conv_N_per_block_; ds_g_n_k_wos_strides[i][1] * conv_N_per_block_;
...@@ -610,7 +602,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -610,7 +602,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
ds_grid_desc_m_n_(i) = ds_grid_desc_m_n_(i) =
DeviceOp::MakeEGridDescriptor_M_N<DLayout>(conv_to_gemm_transformer_d); DeviceOp::MakeEGridDescriptor_M_N<DLayout>(conv_to_gemm_transformer_d);
}); });
compute_ptr_offset_of_groups_.BatchStrideE_ = e_g_n_k_wos_strides[0] * NumGroupsToMerge; compute_ptr_offset_of_groups_.BatchStrideE_ = e_g_n_k_wos_strides[0];
compute_ptr_offset_of_n_.BatchStrideE_ = e_g_n_k_wos_strides[1] * conv_N_per_block_; compute_ptr_offset_of_n_.BatchStrideE_ = e_g_n_k_wos_strides[1] * conv_N_per_block_;
// populate desc for Ds/E // populate desc for Ds/E
...@@ -734,7 +726,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -734,7 +726,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
arg.a_g_n_c_wis_lengths_[I1] / arg.conv_N_per_block_; arg.a_g_n_c_wis_lengths_[I1] / arg.conv_N_per_block_;
const index_t gdx = arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_); const index_t gdx = arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_);
const index_t gdy = arg.num_group_ / NumGroupsToMerge; const index_t gdy = arg.num_group_;
const index_t gdz = num_workgroups_per_Conv_N; const index_t gdz = num_workgroups_per_Conv_N;
const auto K = const auto K =
...@@ -858,10 +850,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -858,10 +850,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
{ {
namespace ctc = tensor_layout::convolution; namespace ctc = tensor_layout::convolution;
const index_t G = arg.b_g_k_c_xs_lengths_[I0];
const index_t K = arg.b_g_k_c_xs_lengths_[I1];
const index_t C = arg.b_g_k_c_xs_lengths_[I2];
// check device // check device
if(get_device_name() == "gfx908") if(get_device_name() == "gfx908")
{ {
...@@ -910,42 +898,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -910,42 +898,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
} }
} }
} }
else if constexpr(ConvForwardSpecialization == ConvolutionForwardSpecialization::Filter3x3)
{
if(C != 1)
{
return false;
}
for(index_t i = 0; i < NDimSpatial; ++i)
{
const index_t filter_spatial_dim = arg.b_g_k_c_xs_lengths_[i + I3];
if(filter_spatial_dim != I3)
{
return false;
}
}
if constexpr(!is_NSpatialGK_GKSpatial_NSpatialGC<ALayout, BLayout, ELayout>())
{
return false;
}
}
if constexpr(NumGroupsToMerge > 1)
{
if(!(C == 1))
{
return false;
}
if(G % NumGroupsToMerge != 0)
{
return false;
}
if constexpr(!is_NSpatialGK_GKSpatial_NSpatialGC<ALayout, BLayout, ELayout>())
{
return false;
}
}
// check vector access of A // check vector access of A
// FIXME: layout // FIXME: layout
...@@ -955,16 +907,11 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -955,16 +907,11 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
is_same_v<ALayout, ctc::NWGC> || is_same_v<ALayout, ctc::NHWGC> || is_same_v<ALayout, ctc::NWGC> || is_same_v<ALayout, ctc::NHWGC> ||
is_same_v<ALayout, ctc::NDHWGC>) is_same_v<ALayout, ctc::NDHWGC>)
{ {
// Check access per C const index_t C = arg.a_g_n_c_wis_lengths_[2];
if(!(ABlockTransferSrcVectorDim == 2 && C % ABlockTransferSrcScalarPerVector == 0)) if(!(ABlockTransferSrcVectorDim == 2 && C % ABlockTransferSrcScalarPerVector == 0))
{ {
// If not possible, check access per G return false;
if(!(ABlockTransferSrcVectorDim == 1 && C == 1 &&
is_NSpatialGK_GKSpatial_NSpatialGC<ALayout, BLayout, ELayout>() &&
G % ABlockTransferSrcScalarPerVector == 0))
{
return false;
}
} }
} }
else else
...@@ -981,6 +928,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -981,6 +928,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
is_same_v<BLayout, ctc::KZYXGC>) is_same_v<BLayout, ctc::KZYXGC>)
{ {
const index_t C = arg.b_g_k_c_xs_lengths_[2];
if(!(BBlockTransferSrcVectorDim == 2 && C % BBlockTransferSrcScalarPerVector == 0)) if(!(BBlockTransferSrcVectorDim == 2 && C % BBlockTransferSrcScalarPerVector == 0))
{ {
return false; return false;
...@@ -1004,6 +953,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -1004,6 +953,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> || is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> ||
is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::G_K>) is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::G_K>)
{ {
const index_t K = arg.ds_g_n_k_wos_lengths_[i][2];
if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0)) if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0))
{ {
valid = false; valid = false;
...@@ -1048,6 +999,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -1048,6 +999,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
is_same_v<ELayout, ctc::NWGK> || is_same_v<ELayout, ctc::NHWGK> || is_same_v<ELayout, ctc::NWGK> || is_same_v<ELayout, ctc::NHWGK> ||
is_same_v<ELayout, ctc::NDHWGK>) is_same_v<ELayout, ctc::NDHWGK>)
{ {
const index_t K = arg.e_g_n_k_wos_lengths_[2];
if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0)) if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0))
{ {
return false; return false;
...@@ -1345,8 +1298,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle ...@@ -1345,8 +1298,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
<< BBlockTransferSrcScalarPerVector << ", " << BBlockTransferSrcScalarPerVector << ", "
<< CDEBlockTransferScalarPerVector_NPerBlock << ", " << CDEBlockTransferScalarPerVector_NPerBlock << ", "
<< CShuffleMXdlPerWavePerShuffle << ", " << CShuffleMXdlPerWavePerShuffle << ", "
<< CShuffleNXdlPerWavePerShuffle << ", " << CShuffleNXdlPerWavePerShuffle
<< NumGroupsToMerge
<< ">"; << ">";
// clang-format on // clang-format on
......
...@@ -44,7 +44,7 @@ __host__ __device__ Y run_cast_to_f8(X x, uint32_t rng) ...@@ -44,7 +44,7 @@ __host__ __device__ Y run_cast_to_f8(X x, uint32_t rng)
// convert to bitwise // convert to bitwise
using T_bitwise = typename NumericUtils<X>::bitwise_type; using T_bitwise = typename NumericUtils<X>::bitwise_type;
T_bitwise x_bitwise = *(reinterpret_cast<T_bitwise*>(&x)); T_bitwise x_bitwise = bit_cast<T_bitwise>(x);
// unpack the input, depends on datatype // unpack the input, depends on datatype
head = x_bitwise & NumericUtils<X>::head_mask; head = x_bitwise & NumericUtils<X>::head_mask;
...@@ -196,18 +196,17 @@ __host__ __device__ Y run_cast_from_f8(X x) ...@@ -196,18 +196,17 @@ __host__ __device__ Y run_cast_from_f8(X x)
// prepare the codes // prepare the codes
constexpr X nan_code = 0x80; constexpr X nan_code = 0x80;
Y Inf, NegInf, NaN, Neg0; using T_bitwise = typename NumericUtils<Y>::bitwise_type;
using T_bitwise = typename NumericUtils<Y>::bitwise_type;
constexpr T_bitwise Inf_bitwise = NumericUtils<Y>::Inf; constexpr T_bitwise Inf_bitwise = NumericUtils<Y>::Inf;
constexpr T_bitwise NegInf_bitwise = NumericUtils<Y>::NegInf; constexpr T_bitwise NegInf_bitwise = NumericUtils<Y>::NegInf;
constexpr T_bitwise NaN_bitwise = NumericUtils<Y>::NaN; constexpr T_bitwise NaN_bitwise = NumericUtils<Y>::NaN;
constexpr T_bitwise Neg0_bitwise = NumericUtils<Y>::Neg0; constexpr T_bitwise Neg0_bitwise = NumericUtils<Y>::Neg0;
Inf = *(reinterpret_cast<const Y*>(&Inf_bitwise)); constexpr Y Inf = bit_cast<Y>(Inf_bitwise);
NegInf = *(reinterpret_cast<const Y*>(&NegInf_bitwise)); constexpr Y NegInf = bit_cast<Y>(NegInf_bitwise);
NaN = *(reinterpret_cast<const Y*>(&NaN_bitwise)); constexpr Y NaN = bit_cast<Y>(NaN_bitwise);
Neg0 = *(reinterpret_cast<const Y*>(&Neg0_bitwise)); constexpr Y Neg0 = bit_cast<Y>(Neg0_bitwise);
// check if x is 0.0 // check if x is 0.0
if(x == 0) if(x == 0)
...@@ -240,7 +239,7 @@ __host__ __device__ Y run_cast_from_f8(X x) ...@@ -240,7 +239,7 @@ __host__ __device__ Y run_cast_from_f8(X x)
{ {
retval = x; retval = x;
retval <<= 8; retval <<= 8;
return *(reinterpret_cast<const Y*>(&retval)); return bit_cast<Y>(retval);
} }
// subnormal input // subnormal input
...@@ -264,7 +263,7 @@ __host__ __device__ Y run_cast_from_f8(X x) ...@@ -264,7 +263,7 @@ __host__ __device__ Y run_cast_from_f8(X x)
} }
retval = (sign << (out_exp + out_mant)) | (exponent << out_mant) | mantissa; retval = (sign << (out_exp + out_mant)) | (exponent << out_mant) | mantissa;
return *(reinterpret_cast<const Y*>(&retval)); return bit_cast<Y>(retval);
} }
} // namespace } // namespace
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment