Commit 8820cf9f authored by Po-Yen, Chen

Merge branch 'develop' into feature/integrage-karg-simplification-pr

parents cb46ef7a 4feebedd
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
  - package-ecosystem: "pip" # See documentation for possible values
    directory: "/docs/.sphinx" # Location of package manifests
    open-pull-requests-limit: 10
    schedule:
      interval: "daily"
FROM ubuntu:20.04
-ARG ROCMVERSION=5.3
+ARG ROCMVERSION=5.6
-ARG compiler_version="release"
+ARG compiler_version=""
ARG compiler_commit=""
RUN set -xe
ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
RUN useradd -rm -d /home/manitera -s /bin/bash -u 1002 manitera
# Add rocm repository
RUN apt-get update
-RUN apt-get install -y wget gnupg
+RUN apt-get install -y wget gnupg curl
-RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
-RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
+RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.6"]; then \
+        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
+        sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"; \
+    else sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
+        apt update && apt-get install -y ./amd-nonfree-radeon_20.04-1_all.deb && \
+        amdgpu-repo --amdgpu-build=1567752 --rocm-build=compute-rocm-dkms-no-npi-hipclang/11914 && \
+        DEBIAN_FRONTEND=noninteractive amdgpu-install -y --usecase=rocm ; \
+    fi
RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
+RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
# Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    apt-utils \
    build-essential \
    ccache \
+    cmake-data \
    cmake \
+    curl \
    git \
    hip-rocclr \
    jq \
@@ -45,6 +49,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    rocm-device-libs \
    rocm-cmake \
    vim \
+    nano \
    zlib1g-dev \
    openssh-server \
    clang-format-10 \
@@ -52,6 +57,17 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
+#Install latest version of cmake
+RUN apt purge --auto-remove -y cmake
+RUN apt update
+RUN apt install -y software-properties-common lsb-release
+RUN apt clean all
+RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
+RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
+RUN apt install -y kitware-archive-keyring
+RUN rm /etc/apt/trusted.gpg.d/kitware.gpg
+RUN apt install -y cmake
# Setup ubsan environment to printstacktrace
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
ENV UBSAN_OPTIONS=print_stacktrace=1
@@ -87,12 +103,7 @@ ENV compiler_commit=$compiler_commit
RUN sh -c "echo compiler version = '$compiler_version'"
RUN sh -c "echo compiler commit = '$compiler_commit'"
-RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ]; then \
-        sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/hip/bin/hipcc.pl && \
-        sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/bin/hipcc.pl; \
-    fi
-RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" = "" ]; then \
+RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; then \
    git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
    cd llvm-project && mkdir build && cd build && \
    cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
@@ -100,7 +111,7 @@ RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_com
    else echo "using the release compiler"; \
fi
-RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" != "" ]; then \
+RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; then \
    git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
    cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
    cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
...
@@ -19,12 +19,33 @@ def runShell(String command){
def getDockerImageName(){
    def img
-    if (params.COMPILER_COMMIT == ""){
-        img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+    if (params.ROCMVERSION != "5.5" && params.ROCMVERSION != "5.6"){
+        if (params.COMPILER_VERSION == "") {
+            img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
+        }
+        else{
+            if (params.COMPILER_COMMIT == ""){
+                img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+            }
+            else{
+                def commit = "${params.COMPILER_COMMIT}"[0..6]
+                img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+            }
+        }
    }
    else{
-        def commit = "${params.COMPILER_COMMIT}"[0..6]
-        img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+        if (params.COMPILER_VERSION == "") {
+            img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}"
+        }
+        else{
+            if (params.COMPILER_COMMIT == ""){
+                img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+            }
+            else{
+                def commit = "${params.COMPILER_COMMIT}"[0..6]
+                img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+            }
+        }
    }
    return img
}
@@ -49,11 +70,11 @@ def build_compiler(){
        compiler = '/opt/rocm/bin/hipcc'
    }
    else{
-        if (params.COMPILER_VERSION == "release"){
-            compiler = "/opt/rocm/llvm/bin/clang++"
+        if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
+            compiler = "/llvm-project/build/bin/clang++"
        }
        else{
-            compiler = "/llvm-project/build/bin/clang++"
+            compiler = "/opt/rocm/llvm/bin/clang++"
        }
    }
    return compiler
@@ -232,7 +253,7 @@ def buildHipClangJob(Map conf=[:]){
        dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
    }
    def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    if (params.COMPILER_VERSION != "release"){
+    if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
        dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
    }
@@ -287,7 +308,7 @@ def runCKProfiler(Map conf=[:]){
        dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
    }
    def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    if (params.COMPILER_VERSION != "release"){
+    if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
        dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
    }
@@ -420,7 +441,7 @@ def Build_CK(Map conf=[:]){
        dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
    }
    def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    if (params.COMPILER_VERSION != "release"){
+    if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
        dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
    }
@@ -576,7 +597,7 @@ def process_results(Map conf=[:]){
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true
-0 21 * * * % COMPILER_VERSION=release;COMPILER_COMMIT=
+0 21 * * * % ROCMVERSION=5.4.3;COMPILER_VERSION=release;COMPILER_COMMIT=
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
pipeline {
@@ -594,16 +615,16 @@ pipeline {
            description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
        string(
            name: 'ROCMVERSION',
-            defaultValue: '5.4.3',
-            description: 'Specify which ROCM version to use: 5.4.3 (default).')
+            defaultValue: '5.6',
+            description: 'Specify which ROCM version to use: 5.6 (default).')
        string(
            name: 'COMPILER_VERSION',
-            defaultValue: 'amd-stg-open',
-            description: 'Specify which version of compiler to use: ck-9110, release, or amd-stg-open (default).')
+            defaultValue: '',
+            description: 'Specify which version of compiler to use: release, amd-stg-open, or leave blank (default).')
        string(
            name: 'COMPILER_COMMIT',
-            defaultValue: '5541927df00eabd6a110180170eca7785d436ee3',
-            description: 'Specify which commit of compiler branch to use: leave empty to use the latest commit, or use 5541927df00eabd6a110180170eca7785d436ee3 (default) commit of amd-stg-open branch.')
+            defaultValue: '',
+            description: 'Specify which commit of compiler branch to use: leave blank to use the latest commit, or use 5541927df00eabd6a110180170eca7785d436ee3 (default) commit of amd-stg-open branch.')
        string(
            name: 'BUILD_COMPILER',
            defaultValue: 'hipcc',
...
-add_executable(client_contraction_scale contraction_scale.cpp)
-target_link_libraries(client_contraction_scale PRIVATE composable_kernel::device_operations)
-add_executable(client_contraction_bilinear contraction_bilinear.cpp)
-target_link_libraries(client_contraction_bilinear PRIVATE composable_kernel::device_operations)
+add_executable(client_contraction_scale_fp32 contraction_scale_fp32.cpp)
+target_link_libraries(client_contraction_scale_fp32 PRIVATE composable_kernel::device_operations)
+add_executable(client_contraction_bilinear_fp32 contraction_bilinear_fp32.cpp)
+target_link_libraries(client_contraction_bilinear_fp32 PRIVATE composable_kernel::device_operations)
+add_executable(client_contraction_scale_fp64 contraction_scale_fp64.cpp)
+target_link_libraries(client_contraction_scale_fp64 PRIVATE composable_kernel::device_operations)
+add_executable(client_contraction_bilinear_fp64 contraction_bilinear_fp64.cpp)
+target_link_libraries(client_contraction_bilinear_fp64 PRIVATE composable_kernel::device_operations)
add_executable(contraction_g1m2n3k1_add_xdl_fp16 contraction_g1m2n3k1_add_xdl_fp16.cpp)
target_link_libraries(contraction_g1m2n3k1_add_xdl_fp16 PRIVATE composable_kernel::device_operations)
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <numeric>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
#include "ck/library/utility/numeric.hpp"
using F64 = double;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Bilinear = ck::tensor_operation::element_wise::Bilinear;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Bilinear;
using ADataType = F64;
using BDataType = F64;
using AccDataType = F64;
using CShuffleDataType = F64;
using DDataType = F64;
using DsDataType = ck::Tuple<DDataType>;
using EDataType = F64;
static constexpr ck::index_t NumDimM = 2;
static constexpr ck::index_t NumDimN = 2;
static constexpr ck::index_t NumDimK = 2;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
// kknn
#if 1
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// knnn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// mknn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// mnnn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
#endif
float alpha = 1.f;
float beta = 1.f;
if(argc == 1)
{
// use default case
}
else if(argc == 25)
{
const ck::index_t M0 = std::stoi(argv[1]);
const ck::index_t M1 = std::stoi(argv[2]);
const ck::index_t N0 = std::stoi(argv[3]);
const ck::index_t N1 = std::stoi(argv[4]);
const ck::index_t K0 = std::stoi(argv[5]);
const ck::index_t K1 = std::stoi(argv[6]);
a_ms_ks_lengths = {M0, M1, K0, K1};
a_ms_ks_strides = {
std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
b_ns_ks_lengths = {N0, N1, K0, K1};
b_ns_ks_strides = {
std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
d_ms_ns_lengths = {M0, M1, N0, N1};
d_ms_ns_strides = {
std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
e_ms_ns_lengths = {M0, M1, N0, N1};
e_ms_ns_strides = {
std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21]), std::stoi(argv[22])};
alpha = std::stof(argv[23]);
beta = std::stof(argv[24]);
}
else
{
printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
printf("arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
printf("arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
printf("arg23 to 24: alpha, beta\n");
exit(0);
}
auto f_tensor_space_size = [](auto lengths, auto strides) {
std::size_t space_size = 1;
for(std::size_t i = 0; i < lengths.size(); ++i)
{
space_size += (lengths[i] - 1) * strides[i];
}
return space_size;
};
SimpleDeviceMem a_device_buf(sizeof(ADataType) *
f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
SimpleDeviceMem b_device_buf(sizeof(BDataType) *
f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
SimpleDeviceMem d_device_buf(sizeof(DDataType) *
f_tensor_space_size(d_ms_ns_lengths, d_ms_ns_strides));
SimpleDeviceMem e_device_buf(sizeof(EDataType) *
f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
ck::Tuple<DDataType>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::Bilinear>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{alpha, beta};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr =
op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
a_ms_ks_lengths,
a_ms_ks_strides,
b_ns_ks_lengths,
b_ns_ks_strides,
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
e_ms_ns_lengths,
e_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
ck::index_t M = ck::accumulate_n<ck::index_t>(
e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
ck::index_t N = ck::accumulate_n<ck::index_t>(
e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
ck::index_t K = ck::accumulate_n<ck::index_t>(
a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return 0;
}
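For reference, a minimal standalone sketch (not the committed code) of the arithmetic behind the TFlops and GB/s numbers reported above, using the kknn shape hard-coded in this example (M = 30 x 128, N = 32 x 64, K = 32 x 64; every tensor is FP64):

// Illustrative only: reproduces the FLOP and byte counts of the timing loop above.
#include <cstddef>
#include <cstdio>
int main()
{
    const std::size_t M = 30 * 128; // product of the two M dimensions of E
    const std::size_t N = 32 * 64;  // product of the two N dimensions of E
    const std::size_t K = 32 * 64;  // product of the two K dimensions of A
    const std::size_t flop  = 2 * M * N * K;                       // one FMA = 2 flops per accumulation
    const std::size_t bytes = 8 * (M * K + K * N + M * N + M * N); // A + B + D + E, 8 bytes each
    std::printf("flop = %zu, bytes = %zu\n", flop, bytes);
    // The example reports flop / 1e9 / ave_time_ms as "TFlops" and bytes / 1e6 / ave_time_ms as GB/s;
    // at 1 ms per run that is roughly 32.2 TFlops and 222 GB/s.
    std::printf("at 1 ms: %.1f TFlops, %.1f GB/s\n", flop / 1.e9, bytes / 1.e6);
}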
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <numeric>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
#include "ck/library/utility/numeric.hpp"
using F64 = double;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale = ck::tensor_operation::element_wise::Scale;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Scale;
using ADataType = F64;
using BDataType = F64;
using AccDataType = F64;
using CShuffleDataType = F64;
using DsDataType = ck::Tuple<>;
using EDataType = F64;
static constexpr ck::index_t NumDimM = 2;
static constexpr ck::index_t NumDimN = 2;
static constexpr ck::index_t NumDimK = 2;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
// kkn
#if 1
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// knn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// mkn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// mnn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
#endif
float scale = 1.f;
if(argc == 1)
{
// use default case
}
else if(argc == 20)
{
const ck::index_t M0 = std::stoi(argv[1]);
const ck::index_t M1 = std::stoi(argv[2]);
const ck::index_t N0 = std::stoi(argv[3]);
const ck::index_t N1 = std::stoi(argv[4]);
const ck::index_t K0 = std::stoi(argv[5]);
const ck::index_t K1 = std::stoi(argv[6]);
a_ms_ks_lengths = {M0, M1, K0, K1};
a_ms_ks_strides = {
std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
b_ns_ks_lengths = {N0, N1, K0, K1};
b_ns_ks_strides = {
std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
e_ms_ns_lengths = {M0, M1, N0, N1};
e_ms_ns_strides = {
std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
scale = std::stof(argv[19]);
}
else
{
printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
printf("arg19: scale\n");
exit(0);
}
auto f_tensor_space_size = [](auto lengths, auto strides) {
std::size_t space_size = 1;
for(std::size_t i = 0; i < lengths.size(); ++i)
{
space_size += (lengths[i] - 1) * strides[i];
}
return space_size;
};
SimpleDeviceMem a_device_buf(sizeof(ADataType) *
f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
SimpleDeviceMem b_device_buf(sizeof(BDataType) *
f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
SimpleDeviceMem e_device_buf(sizeof(EDataType) *
f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
ck::Tuple<>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::Scale>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{scale};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 0>{},
e_device_buf.GetDeviceBuffer(),
a_ms_ks_lengths,
a_ms_ks_strides,
b_ns_ks_lengths,
b_ns_ks_strides,
std::array<std::vector<ck::index_t>, 0>{},
std::array<std::vector<ck::index_t>, 0>{},
e_ms_ns_lengths,
e_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
ck::index_t M = ck::accumulate_n<ck::index_t>(
e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
ck::index_t N = ck::accumulate_n<ck::index_t>(
e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
ck::index_t K = ck::accumulate_n<ck::index_t>(
a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return 0;
}
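The f_tensor_space_size helper above sizes each allocation from lengths and strides rather than from the raw element count, so non-compact layouts are covered as well; a small illustrative sketch (not part of the commit) applied to the default A tensor of this example:

// Illustrative only: the footprint formula of f_tensor_space_size, applied to
// lengths {30, 128, 32, 64} and strides {524288, 4096, 128, 1}.
#include <array>
#include <cstddef>
#include <cstdio>
int main()
{
    const std::array<std::size_t, 4> lengths{30, 128, 32, 64};
    const std::array<std::size_t, 4> strides{524288, 4096, 128, 1};
    std::size_t space_size = 1; // highest addressable element + 1
    for(std::size_t i = 0; i < lengths.size(); ++i)
        space_size += (lengths[i] - 1) * strides[i];
    // 15728576 elements; times sizeof(double) this is ~125.8 MB for the A buffer.
    std::printf("%zu elements, %zu bytes\n", space_size, space_size * sizeof(double));
}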
@@ -17,22 +17,22 @@ using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
-using InLayout = ck::tensor_layout::convolution::GNHWC;
+using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 32;
-static constexpr ck::index_t N = 256;
-static constexpr ck::index_t K = 192;
-static constexpr ck::index_t C = 192;
-static constexpr ck::index_t Y = 3;
-static constexpr ck::index_t X = 3;
-static constexpr ck::index_t Hi = 28;
-static constexpr ck::index_t Wi = 28;
-static constexpr ck::index_t Ho = 28;
-static constexpr ck::index_t Wo = 28;
+static constexpr ck::index_t N = 256; // batch size
+static constexpr ck::index_t K = 64;  // output channel
+static constexpr ck::index_t C = 32;  // input channel (per group)
+static constexpr ck::index_t Y = 3;   // filter H
+static constexpr ck::index_t X = 3;   // filter W
+static constexpr ck::index_t Hi = 28; // input H
+static constexpr ck::index_t Wi = 28; // input W
+static constexpr ck::index_t Ho = 28; // output H
+static constexpr ck::index_t Wo = 28; // output W
struct SimpleDeviceMem
{
@@ -52,50 +52,24 @@ struct SimpleDeviceMem
int main()
{
-    std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
-    std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};
-    std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
-    std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};
-    std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
-    std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};
-    std::partial_sum(rbegin(in_lengths),
-                     std::prev(rend(in_lengths)),
-                     std::next(rbegin(in_strides)),
-                     std::multiplies<>{});
-    std::partial_sum(rbegin(wei_lengths),
-                     std::prev(rend(wei_lengths)),
-                     std::next(rbegin(wei_strides)),
-                     std::multiplies<>{});
-    std::partial_sum(rbegin(out_lengths),
-                     std::prev(rend(out_lengths)),
-                     std::next(rbegin(out_strides)),
-                     std::multiplies<>{});
-    // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
-    std::rotate(
-        rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
-    std::rotate(
-        rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
-    std::rotate(
-        rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
-    std::rotate(
-        rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
-    std::rotate(
-        rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
-    std::rotate(
-        rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
+    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
+    std::array<ck::index_t, 5> wei_lengths{G, K, C, Y, X};
+    std::array<ck::index_t, 5> wei_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
-    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                                                 InLayout,
@@ -155,9 +129,9 @@ int main()
        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
        std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
-        std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
+        std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
                                sizeof(WeiDataType) * G * K * Y * X * C +
-                                sizeof(OutDataType) * G * N * Ho * Wo * K;
+                                sizeof(OutDataType) * N * Ho * Wo * G * K;
        float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
        float gb_per_sec = num_bytes / 1.E6 / avg_time;
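The NHWGC/GKYXC/NHWGK stride literals introduced by this change follow mechanically from the memory layout once the lengths are listed in the GNCHW/GKCYX/GNCHW order the API expects: the stride of a logical dimension is the product of the extents of everything to its right in memory. A short illustrative sketch (not part of the commit) for the input tensor:

// Illustrative only: derive GNCHW-ordered strides for an NHWGC input tensor,
// matching in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C} above.
#include <array>
#include <cstdio>
int main()
{
    const int G = 32, C = 32, Hi = 28, Wi = 28;
    const std::array<int, 5> in_strides_gnchw{
        C,               // G: stepping one group skips that group's C elements
        Hi * Wi * G * C, // N: one image is H * W * G * C elements
        1,               // C: contiguous innermost dimension
        Wi * G * C,      // H: one row is W * G * C elements
        G * C            // W: one pixel is G * C elements
    };
    for(int s : in_strides_gnchw)
        std::printf("%d ", s); // prints: 32 802816 1 28672 1024
    std::printf("\n");
}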
...
@@ -17,26 +17,26 @@ using BiasDataType = int32_t;
using RequantScaleDataType = float;
using OutDataType = int8_t;
-using InLayout = ck::tensor_layout::convolution::GNHWC;
+using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K;
using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::Relu;
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G = 1;
+static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4;   // batch size
-static constexpr ck::index_t K = 64;  // output channel
-static constexpr ck::index_t C = 192; // input channel
+static constexpr ck::index_t K = 32;  // output channel
+static constexpr ck::index_t C = 64;  // input channel (per group)
static constexpr ck::index_t Y = 3;   // filter H
static constexpr ck::index_t X = 3;   // filter W
static constexpr ck::index_t Hi = 71; // input H
static constexpr ck::index_t Wi = 71; // input W
static constexpr ck::index_t Ho = 36; // output H
static constexpr ck::index_t Wo = 36; // output W
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;
@@ -55,8 +55,11 @@ struct SimpleDeviceMem
int main(int argc, char* argv[])
{
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
@@ -64,17 +67,18 @@ int main(int argc, char* argv[])
    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};
-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
        NumDimSpatial,
@@ -203,4 +207,4 @@ int main(int argc, char* argv[])
    }
    return 0;
}
\ No newline at end of file
@@ -16,19 +16,19 @@ using WeiDataType = int8_t;
using BiasDataType = int32_t;
using OutDataType = int8_t;
-using InLayout = ck::tensor_layout::convolution::GNHWC;
+using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::Relu;
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G = 1;
+static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4;   // batch size
-static constexpr ck::index_t K = 64;  // output channel
-static constexpr ck::index_t C = 192; // input channel
+static constexpr ck::index_t K = 32;  // output channel
+static constexpr ck::index_t C = 64;  // input channel (per group)
static constexpr ck::index_t Y = 3;   // filter H
static constexpr ck::index_t X = 3;   // filter W
static constexpr ck::index_t Hi = 71; // input H
@@ -55,23 +55,27 @@ struct SimpleDeviceMem
int main(int argc, char* argv[])
{
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};
-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
    using DeviceOp =
        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
...
@@ -17,21 +17,21 @@ using BiasDataType = int32_t;
using RequantScaleDataType = float;
using OutDataType = int8_t;
-using InLayout = ck::tensor_layout::convolution::GNHWC;
+using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K;
using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::TanH;
using OutElementOp =
    ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G = 1;
+static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4;   // batch size
-static constexpr ck::index_t K = 64;  // output channel
-static constexpr ck::index_t C = 192; // input channel
+static constexpr ck::index_t K = 32;  // output channel
+static constexpr ck::index_t C = 64;  // input channel (per group)
static constexpr ck::index_t Y = 3;   // filter H
static constexpr ck::index_t X = 3;   // filter W
static constexpr ck::index_t Hi = 71; // input H
@@ -58,8 +58,11 @@ struct SimpleDeviceMem
int main(int argc, char* argv[])
{
+    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+    // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+    // Hence, we need to adjust the order of stride
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
@@ -67,17 +70,18 @@ int main(int argc, char* argv[])
    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
-    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};
-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-    SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
-    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
        NumDimSpatial,
@@ -206,4 +210,4 @@ int main(int argc, char* argv[])
    }
    return 0;
}
\ No newline at end of file
...@@ -16,19 +16,19 @@ using WeiDataType = int8_t; ...@@ -16,19 +16,19 @@ using WeiDataType = int8_t;
using BiasDataType = int32_t; using BiasDataType = int32_t;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K; using BiasLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::TanH; using ActivationOp = ck::tensor_operation::element_wise::TanH;
using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; // output channel static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 192; // input channel static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H static constexpr ck::index_t Hi = 71; // input H
...@@ -56,23 +56,27 @@ struct SimpleDeviceMem ...@@ -56,23 +56,27 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0}; std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
......
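The new in_strides array above follows directly from keeping the buffer in NHWGC order while passing lengths and strides in GNCHW order: in a packed NHWGC tensor, stride(C) = 1, stride(G) = C, stride(W) = G * C, stride(H) = Wi * G * C and stride(N) = Hi * Wi * G * C. A minimal standalone sketch of that derivation (the helper name and the plain index_t alias are illustrative, not part of the change):

#include <array>
#include <cstdint>

using index_t = std::int32_t; // stand-in for ck::index_t in this sketch

// Strides of a packed NHWGC tensor, reported in the GNCHW order the API expects.
std::array<index_t, 5> nhwgc_strides_in_gnchw_order(index_t G, index_t C, index_t Hi, index_t Wi)
{
    const index_t sC = 1;               // C varies fastest in memory
    const index_t sG = C;
    const index_t sW = G * C;
    const index_t sH = Wi * G * C;
    const index_t sN = Hi * Wi * G * C; // N varies slowest
    return {sG, sN, sC, sH, sW};        // reordered to G, N, C, H, W
}

For a packed NHWGK output the analogous pattern uses K, Ho and Wo in place of C, Hi and Wi.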
...@@ -16,25 +16,25 @@ using WeiDataType = int8_t; ...@@ -16,25 +16,25 @@ using WeiDataType = int8_t;
using RequantScaleDataType = float; using RequantScaleDataType = float;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using RequantScaleLayout = ck::tensor_layout::convolution::G_K; using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = PassThrough; using ActivationOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; // output channel static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 192; // input channel static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H static constexpr ck::index_t Hi = 71; // input H
static constexpr ck::index_t Wi = 71; // input W static constexpr ck::index_t Wi = 71; // input W
static constexpr ck::index_t Ho = 36; // output H static constexpr ck::index_t Ho = 36; // output H
static constexpr ck::index_t Wo = 36; // output W static constexpr ck::index_t Wo = 36; // output W
struct SimpleDeviceMem struct SimpleDeviceMem
{ {
...@@ -54,23 +54,27 @@ struct SimpleDeviceMem ...@@ -54,23 +54,27 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) layouts in memory
// However, CK's API only accepts lengths and strides given in GNCHW/GKCYX/GNKHW order
// Hence, the stride entries have to be reordered accordingly
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0}; std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C); SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
...@@ -196,4 +200,4 @@ int main(int argc, char* argv[]) ...@@ -196,4 +200,4 @@ int main(int argc, char* argv[])
} }
return 0; return 0;
} }
\ No newline at end of file
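A note on the bias and requantization-scale tensors in the two hunks above: both hold one value per (group, output channel), i.e. G * K elements, so with lengths {G, N, K, Ho, Wo} the stride array {K, 0, 1, 0, 0} addresses element g * K + k and the zero strides broadcast it over N, Ho and Wo. A quick sketch of the addressing (comments only, no CK calls assumed):

// offset of the bias / scale value applied to output element (g, n, k, ho, wo):
//   offset = g * K + n * 0 + k * 1 + ho * 0 + wo * 0 = g * K + k
// i.e. one value per (group, output channel), reused for every batch and spatial position,
// which is also why the corrected allocations are sizeof(T) * G * K.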
...@@ -15,18 +15,18 @@ using InDataType = int8_t; ...@@ -15,18 +15,18 @@ using InDataType = int8_t;
using WeiDataType = int8_t; using WeiDataType = int8_t;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = PassThrough; using ActivationOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; // output channel static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 192; // input channel static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H static constexpr ck::index_t Hi = 71; // input H
...@@ -53,20 +53,24 @@ struct SimpleDeviceMem ...@@ -53,20 +53,24 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) layouts in memory
// However, CK's API only accepts lengths and strides given in GNCHW/GKCYX/GNKHW order
// Hence, the stride entries have to be reordered accordingly
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
InLayout, InLayout,
......
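A note on the element-wise epilogues these int8 examples use: reading the operator names left to right, Activation_Mul_Clamp applies the activation to the int32 accumulator, multiplies by a requantization scale and saturates to int8, while Add_Mul_Activation_Mul_Clamp additionally adds the bias and applies a second scale. That reading is inferred from the names rather than stated in the diff; a rough standalone sketch of the simplest variant:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hedged sketch of an Activation_Mul_Clamp-style int8 requantization
// (an interpretation of the operator name, not CK's exact implementation).
std::int8_t requantize(std::int32_t acc, float scale)
{
    float v = static_cast<float>(acc);  // ActivationOp is PassThrough in this example
    v *= scale;                         // per-channel requantization scale
    v = std::clamp(v, -128.0f, 127.0f); // saturate to the int8 range
    return static_cast<std::int8_t>(std::lround(v));
}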
add_executable(client_groupnorm_swish groupnorm_swish.cpp)
target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp"
using XDataType = ck::half_t;
using GammaDataType = float;
using BetaDataType = float;
using YDataType = ck::half_t;
using ComputeDataType = float;
using Swish = ck::tensor_operation::element_wise::Swish;
constexpr int Rank = 5;
constexpr int NumReduceDim = 3;
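// Thin RAII wrapper around hipMalloc/hipFree; return codes are deliberately ignored in this example.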
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
ck::index_t N = 32;
ck::index_t H = 16;
ck::index_t W = 16;
ck::index_t G = 64;
ck::index_t C = 128;
std::size_t xy_size = N * H * W * G * C;
std::size_t gamma_beta_size = G * C;
std::vector<ck::index_t> xy_strides = {H * W * G * C, W * G * C, G * C, C, 1};
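// gamma and beta hold G * C elements; the zero strides broadcast them over N, H and W.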
std::vector<ck::index_t> gamma_beta_strides = {0, 0, 0, C, 1};
SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size);
SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_beta_size);
SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * gamma_beta_size);
SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
Swish,
Rank,
NumReduceDim>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
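// lengths are {N, H, W, G, C}, so reduceDims {1, 2, 4} normalizes over (H, W, C) within each (n, g) slice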
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
{1, 2, 4}, // reduceDims
1e-6,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
nullptr,
nullptr,
Swish{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
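// bytes moved: one read of x, gamma and beta plus one write of y; ave_time is in ms, so bytes / 1e6 / ms yields GB/s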
std::size_t num_byte =
sizeof(XDataType) * xy_size + sizeof(GammaDataType) * gamma_beta_size +
sizeof(BetaDataType) * gamma_beta_size + sizeof(YDataType) * xy_size;
float gb_per_sec = num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(ave_time < best_ave_time)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
// run the best instance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
{1, 2, 4}, // reduceDims
1e-6,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
nullptr,
nullptr,
Swish{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
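For reference, a sketch of what this new client example computes, assuming standard group-normalization semantics and CK's default Swish parameter (this summary is not part of the diff):

    mu, var : mean and variance over the (H, W, C) extent of each (n, g) slice
    y = swish(gamma * (x - mu) / sqrt(var + 1e-6) + beta),  with swish(t) = t * sigmoid(t)

gamma and beta are broadcast from their (G, C) shape, matching the zero strides used above.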
...@@ -92,6 +92,7 @@ else() ...@@ -92,6 +92,7 @@ else()
-Wno-unused-command-line-argument -Wno-unused-command-line-argument
-Wno-weak-vtables -Wno-weak-vtables
-Wno-covered-switch-default -Wno-covered-switch-default
-Wno-unsafe-buffer-usage
) )
else() else()
if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "GNU" AND ${COMPILER} MATCHES "CXX") if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "GNU" AND ${COMPILER} MATCHES "CXX")
......
...@@ -21,6 +21,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS ...@@ -21,6 +21,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
-Wno-comma -Wno-comma
-Wno-old-style-cast -Wno-old-style-cast
-Wno-deprecated -Wno-deprecated
-Wno-unsafe-buffer-usage
) )
message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}") message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")
......
git+https://github.com/RadeonOpenCompute/rocm-docs-core.git rocm-docs-core==0.2.0
sphinxcontrib-bibtex==2.5.0 sphinxcontrib-bibtex==2.5.0