Commit bbe74503 authored by Jun Liu's avatar Jun Liu
Browse files

Merge branch 'develop' into amd-develop

parents 8b76b832 f53ede26
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/docs/.sphinx" # Location of package manifests
open-pull-requests-limit: 10
schedule:
interval: "daily"
...@@ -48,6 +48,11 @@ build* ...@@ -48,6 +48,11 @@ build*
.gdb_history .gdb_history
install.dir* install.dir*
# directories containing generated documentation # documentation artifacts
docs/source/_build/ build/
docs/docBin/ _build/
_images/
_static/
_templates/
_toc.yml
docBin/
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.8"
sphinx:
configuration: docs/conf.py
formats: [htmlzip]
python:
install:
- requirements: docs/.sphinx/requirements.txt
...@@ -22,6 +22,7 @@ include(TargetFlags) ...@@ -22,6 +22,7 @@ include(TargetFlags)
list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip) list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF) option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
option(USE_OPT_NAVI3X, "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF)
if(USE_BITINT_EXTENSION_INT4) if(USE_BITINT_EXTENSION_INT4)
add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
...@@ -29,6 +30,12 @@ if(USE_BITINT_EXTENSION_INT4) ...@@ -29,6 +30,12 @@ if(USE_BITINT_EXTENSION_INT4)
message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}") message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
endif() endif()
if(USE_OPT_NAVI3X)
add_compile_options(-mcumode)
add_compile_options(-mno-wavefrontsize64)
message("CK compiled with USE_OPT_NAVI3X set to ${USE_OPT_NAVI3X}")
endif()
## Threads ## Threads
set(THREADS_PREFER_PTHREAD_FLAG ON) set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
......
FROM ubuntu:20.04 FROM ubuntu:20.04
ARG ROCMVERSION=5.3 ARG ROCMVERSION=5.6
ARG compiler_version="release" ARG compiler_version=""
ARG compiler_commit="" ARG compiler_commit=""
RUN set -xe RUN set -xe
ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
RUN useradd -rm -d /home/manitera -s /bin/bash -u 1002 manitera
# Add rocm repository # Add rocm repository
RUN apt-get update RUN apt-get update
RUN apt-get install -y wget gnupg RUN apt-get install -y wget gnupg curl
RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.6"]; then \
RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"; \
else sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \
apt update && apt-get install -y ./amd-nonfree-radeon_20.04-1_all.deb && \
amdgpu-repo --amdgpu-build=1567752 --rocm-build=compute-rocm-dkms-no-npi-hipclang/11914 && \
DEBIAN_FRONTEND=noninteractive amdgpu-install -y --usecase=rocm ; \
fi
RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
# Install dependencies # Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
apt-utils \ apt-utils \
build-essential \ build-essential \
ccache \ ccache \
cmake-data \
cmake \ cmake \
curl \
git \ git \
hip-rocclr \ hip-rocclr \
jq \ jq \
...@@ -45,6 +49,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- ...@@ -45,6 +49,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
rocm-device-libs \ rocm-device-libs \
rocm-cmake \ rocm-cmake \
vim \ vim \
nano \
zlib1g-dev \ zlib1g-dev \
openssh-server \ openssh-server \
clang-format-10 \ clang-format-10 \
...@@ -52,6 +57,17 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- ...@@ -52,6 +57,17 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
#Install latest version of cmake
RUN apt purge --auto-remove -y cmake
RUN apt update
RUN apt install -y software-properties-common lsb-release
RUN apt clean all
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
RUN apt install -y kitware-archive-keyring
RUN rm /etc/apt/trusted.gpg.d/kitware.gpg
RUN apt install -y cmake
# Setup ubsan environment to printstacktrace # Setup ubsan environment to printstacktrace
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
ENV UBSAN_OPTIONS=print_stacktrace=1 ENV UBSAN_OPTIONS=print_stacktrace=1
...@@ -87,12 +103,7 @@ ENV compiler_commit=$compiler_commit ...@@ -87,12 +103,7 @@ ENV compiler_commit=$compiler_commit
RUN sh -c "echo compiler version = '$compiler_version'" RUN sh -c "echo compiler version = '$compiler_version'"
RUN sh -c "echo compiler commit = '$compiler_commit'" RUN sh -c "echo compiler commit = '$compiler_commit'"
RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ]; then \ RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; then \
sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/hip/bin/hipcc.pl && \
sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/bin/hipcc.pl; \
fi
RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" = "" ]; then \
git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \ git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
cd llvm-project && mkdir build && cd build && \ cd llvm-project && mkdir build && cd build && \
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
...@@ -100,7 +111,7 @@ RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_com ...@@ -100,7 +111,7 @@ RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_com
else echo "using the release compiler"; \ else echo "using the release compiler"; \
fi fi
RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" != "" ]; then \ RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; then \
git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \ git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
......
...@@ -19,12 +19,33 @@ def runShell(String command){ ...@@ -19,12 +19,33 @@ def runShell(String command){
def getDockerImageName(){ def getDockerImageName(){
def img def img
if (params.COMPILER_COMMIT == ""){ if (params.ROCMVERSION != "5.5" && params.ROCMVERSION != "5.6"){
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}" if (params.COMPILER_VERSION == "") {
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
}
else{
if (params.COMPILER_COMMIT == ""){
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
}
else{
def commit = "${params.COMPILER_COMMIT}"[0..6]
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
}
}
} }
else{ else{
def commit = "${params.COMPILER_COMMIT}"[0..6] if (params.COMPILER_VERSION == "") {
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}" img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}"
}
else{
if (params.COMPILER_COMMIT == ""){
img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
}
else{
def commit = "${params.COMPILER_COMMIT}"[0..6]
img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
}
}
} }
return img return img
} }
...@@ -49,11 +70,11 @@ def build_compiler(){ ...@@ -49,11 +70,11 @@ def build_compiler(){
compiler = '/opt/rocm/bin/hipcc' compiler = '/opt/rocm/bin/hipcc'
} }
else{ else{
if (params.COMPILER_VERSION == "release"){ if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
compiler = "/opt/rocm/llvm/bin/clang++" compiler = "/llvm-project/build/bin/clang++"
} }
else{ else{
compiler = "/llvm-project/build/bin/clang++" compiler = "/opt/rocm/llvm/bin/clang++"
} }
} }
return compiler return compiler
...@@ -232,7 +253,7 @@ def buildHipClangJob(Map conf=[:]){ ...@@ -232,7 +253,7 @@ def buildHipClangJob(Map conf=[:]){
dockerOpts = dockerOpts + " --env HSA_XNACK=1 " dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
} }
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if (params.COMPILER_VERSION != "release"){ if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
} }
...@@ -287,7 +308,7 @@ def runCKProfiler(Map conf=[:]){ ...@@ -287,7 +308,7 @@ def runCKProfiler(Map conf=[:]){
dockerOpts = dockerOpts + " --env HSA_XNACK=1 " dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
} }
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if (params.COMPILER_VERSION != "release"){ if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
} }
...@@ -420,7 +441,7 @@ def Build_CK(Map conf=[:]){ ...@@ -420,7 +441,7 @@ def Build_CK(Map conf=[:]){
dockerOpts = dockerOpts + " --env HSA_XNACK=1 " dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
} }
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if (params.COMPILER_VERSION != "release"){ if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
} }
...@@ -576,7 +597,7 @@ def process_results(Map conf=[:]){ ...@@ -576,7 +597,7 @@ def process_results(Map conf=[:]){
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true
0 21 * * * % COMPILER_VERSION=release;COMPILER_COMMIT= 0 21 * * * % ROCMVERSION=5.4.3;COMPILER_VERSION=release;COMPILER_COMMIT=
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : "" 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
pipeline { pipeline {
...@@ -594,16 +615,16 @@ pipeline { ...@@ -594,16 +615,16 @@ pipeline {
description: "Force building docker image (default: false), set to true if docker image needs to be updated.") description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
string( string(
name: 'ROCMVERSION', name: 'ROCMVERSION',
defaultValue: '5.4.3', defaultValue: '5.6',
description: 'Specify which ROCM version to use: 5.4.3 (default).') description: 'Specify which ROCM version to use: 5.6 (default).')
string( string(
name: 'COMPILER_VERSION', name: 'COMPILER_VERSION',
defaultValue: 'amd-stg-open', defaultValue: '',
description: 'Specify which version of compiler to use: ck-9110, release, or amd-stg-open (default).') description: 'Specify which version of compiler to use: release, amd-stg-open, or leave blank (default).')
string( string(
name: 'COMPILER_COMMIT', name: 'COMPILER_COMMIT',
defaultValue: '5541927df00eabd6a110180170eca7785d436ee3', defaultValue: '',
description: 'Specify which commit of compiler branch to use: leave empty to use the latest commit, or use 5541927df00eabd6a110180170eca7785d436ee3 (default) commit of amd-stg-open branch.') description: 'Specify which commit of compiler branch to use: leave blank to use the latest commit, or use 5541927df00eabd6a110180170eca7785d436ee3 (default) commit of amd-stg-open branch.')
string( string(
name: 'BUILD_COMPILER', name: 'BUILD_COMPILER',
defaultValue: 'hipcc', defaultValue: 'hipcc',
......
...@@ -7,7 +7,7 @@ CK utilizes two concepts to achieve performance portability and code maintainabi ...@@ -7,7 +7,7 @@ CK utilizes two concepts to achieve performance portability and code maintainabi
* A tile-based programming model * A tile-based programming model
* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation". * Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation".
![ALT](/doc/image/ck_component.png "CK Components") ![ALT](/docs/data/ck_component.png "CK Components")
## Code Structure ## Code Structure
Current CK library are structured into 4 layers: Current CK library are structured into 4 layers:
...@@ -16,7 +16,17 @@ Current CK library are structured into 4 layers: ...@@ -16,7 +16,17 @@ Current CK library are structured into 4 layers:
* "Instantiated Kernel and Invoker" layer * "Instantiated Kernel and Invoker" layer
* "Client API" layer * "Client API" layer
![ALT](/doc/image/ck_layer.png "CK Layers") ![ALT](/docs/data/ck_layer.png "CK Layers")
## Documentation
Run the steps below to build documentation locally.
```
cd docs
pip3 install -r .sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
## Contributors ## Contributors
The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md) The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
......
add_executable(client_contraction_scale contraction_scale.cpp) add_executable(client_contraction_scale_fp32 contraction_scale_fp32.cpp)
target_link_libraries(client_contraction_scale PRIVATE composable_kernel::device_operations) target_link_libraries(client_contraction_scale_fp32 PRIVATE composable_kernel::device_operations)
add_executable(client_contraction_bilinear contraction_bilinear.cpp) add_executable(client_contraction_bilinear_fp32 contraction_bilinear_fp32.cpp)
target_link_libraries(client_contraction_bilinear PRIVATE composable_kernel::device_operations) target_link_libraries(client_contraction_bilinear_fp32 PRIVATE composable_kernel::device_operations)
add_executable(client_contraction_scale_fp64 contraction_scale_fp64.cpp)
target_link_libraries(client_contraction_scale_fp64 PRIVATE composable_kernel::device_operations)
add_executable(client_contraction_bilinear_fp64 contraction_bilinear_fp64.cpp)
target_link_libraries(client_contraction_bilinear_fp64 PRIVATE composable_kernel::device_operations)
add_executable(contraction_g1m2n3k1_add_xdl_fp16 contraction_g1m2n3k1_add_xdl_fp16.cpp) add_executable(contraction_g1m2n3k1_add_xdl_fp16 contraction_g1m2n3k1_add_xdl_fp16.cpp)
target_link_libraries(contraction_g1m2n3k1_add_xdl_fp16 PRIVATE composable_kernel::device_operations) target_link_libraries(contraction_g1m2n3k1_add_xdl_fp16 PRIVATE composable_kernel::device_operations)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <numeric>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
#include "ck/library/utility/numeric.hpp"
using F64 = double;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Bilinear = ck::tensor_operation::element_wise::Bilinear;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Bilinear;
using ADataType = F64;
using BDataType = F64;
using AccDataType = F64;
using CShuffleDataType = F64;
using DDataType = F64;
using DsDataType = ck::Tuple<DDataType>;
using EDataType = F64;
static constexpr ck::index_t NumDimM = 2;
static constexpr ck::index_t NumDimN = 2;
static constexpr ck::index_t NumDimK = 2;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
// kknn
#if 1
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// knnn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// mknn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// mnnn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
#endif
float alpha = 1.f;
float beta = 1.f;
if(argc == 1)
{
// use default case
}
else if(argc == 25)
{
const ck::index_t M0 = std::stoi(argv[1]);
const ck::index_t M1 = std::stoi(argv[2]);
const ck::index_t N0 = std::stoi(argv[3]);
const ck::index_t N1 = std::stoi(argv[4]);
const ck::index_t K0 = std::stoi(argv[5]);
const ck::index_t K1 = std::stoi(argv[6]);
a_ms_ks_lengths = {M0, M1, K0, K1};
a_ms_ks_strides = {
std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
b_ns_ks_lengths = {N0, N1, K0, K1};
b_ns_ks_strides = {
std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
d_ms_ns_lengths = {M0, M1, N0, N1};
d_ms_ns_strides = {
std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
e_ms_ns_lengths = {M0, M1, N0, N1};
e_ms_ns_strides = {
std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21]), std::stoi(argv[22])};
alpha = std::stof(argv[23]);
beta = std::stof(argv[24]);
}
else
{
printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
printf("arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
printf("arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
printf("arg23 to 24: alpha, beta\n");
exit(0);
}
auto f_tensor_space_size = [](auto lengths, auto strides) {
std::size_t space_size = 1;
for(std::size_t i = 0; i < lengths.size(); ++i)
{
space_size += (lengths[i] - 1) * strides[i];
}
return space_size;
};
SimpleDeviceMem a_device_buf(sizeof(ADataType) *
f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
SimpleDeviceMem b_device_buf(sizeof(BDataType) *
f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
SimpleDeviceMem d_device_buf(sizeof(DDataType) *
f_tensor_space_size(d_ms_ns_lengths, d_ms_ns_strides));
SimpleDeviceMem e_device_buf(sizeof(EDataType) *
f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
ck::Tuple<DDataType>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::Bilinear>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{alpha, beta};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr =
op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
a_ms_ks_lengths,
a_ms_ks_strides,
b_ns_ks_lengths,
b_ns_ks_strides,
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
e_ms_ns_lengths,
e_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
ck::index_t M = ck::accumulate_n<ck::index_t>(
e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
ck::index_t N = ck::accumulate_n<ck::index_t>(
e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
ck::index_t K = ck::accumulate_n<ck::index_t>(
a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <numeric>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
#include "ck/library/utility/numeric.hpp"
using F64 = double;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale = ck::tensor_operation::element_wise::Scale;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Scale;
using ADataType = F64;
using BDataType = F64;
using AccDataType = F64;
using CShuffleDataType = F64;
using DsDataType = ck::Tuple<>;
using EDataType = F64;
static constexpr ck::index_t NumDimM = 2;
static constexpr ck::index_t NumDimN = 2;
static constexpr ck::index_t NumDimK = 2;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
// kkn
#if 1
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// knn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// mkn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
// mnn
#elif 0
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
#endif
float scale = 1.f;
if(argc == 1)
{
// use default case
}
else if(argc == 20)
{
const ck::index_t M0 = std::stoi(argv[1]);
const ck::index_t M1 = std::stoi(argv[2]);
const ck::index_t N0 = std::stoi(argv[3]);
const ck::index_t N1 = std::stoi(argv[4]);
const ck::index_t K0 = std::stoi(argv[5]);
const ck::index_t K1 = std::stoi(argv[6]);
a_ms_ks_lengths = {M0, M1, K0, K1};
a_ms_ks_strides = {
std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
b_ns_ks_lengths = {N0, N1, K0, K1};
b_ns_ks_strides = {
std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
e_ms_ns_lengths = {M0, M1, N0, N1};
e_ms_ns_strides = {
std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
scale = std::stof(argv[19]);
}
else
{
printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
printf("arg19: scale\n");
exit(0);
}
auto f_tensor_space_size = [](auto lengths, auto strides) {
std::size_t space_size = 1;
for(std::size_t i = 0; i < lengths.size(); ++i)
{
space_size += (lengths[i] - 1) * strides[i];
}
return space_size;
};
SimpleDeviceMem a_device_buf(sizeof(ADataType) *
f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
SimpleDeviceMem b_device_buf(sizeof(BDataType) *
f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
SimpleDeviceMem e_device_buf(sizeof(EDataType) *
f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
ck::Tuple<>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::Scale>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{scale};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 0>{},
e_device_buf.GetDeviceBuffer(),
a_ms_ks_lengths,
a_ms_ks_strides,
b_ns_ks_lengths,
b_ns_ks_strides,
std::array<std::vector<ck::index_t>, 0>{},
std::array<std::vector<ck::index_t>, 0>{},
e_ms_ns_lengths,
e_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
ck::index_t M = ck::accumulate_n<ck::index_t>(
e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
ck::index_t N = ck::accumulate_n<ck::index_t>(
e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
ck::index_t K = ck::accumulate_n<ck::index_t>(
a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return 0;
}
...@@ -17,22 +17,22 @@ using InDataType = ck::half_t; ...@@ -17,22 +17,22 @@ using InDataType = ck::half_t;
using WeiDataType = ck::half_t; using WeiDataType = ck::half_t;
using OutDataType = ck::half_t; using OutDataType = ck::half_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 32; static constexpr ck::index_t G = 32;
static constexpr ck::index_t N = 256; static constexpr ck::index_t N = 256; // batch size
static constexpr ck::index_t K = 192; static constexpr ck::index_t K = 64; // output channel
static constexpr ck::index_t C = 192; static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Y = 3; static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 28; static constexpr ck::index_t Hi = 28; // input H
static constexpr ck::index_t Wi = 28; static constexpr ck::index_t Wi = 28; // input W
static constexpr ck::index_t Ho = 28; static constexpr ck::index_t Ho = 28; // output H
static constexpr ck::index_t Wo = 28; static constexpr ck::index_t Wo = 28; // output W
struct SimpleDeviceMem struct SimpleDeviceMem
{ {
...@@ -52,50 +52,24 @@ struct SimpleDeviceMem ...@@ -52,50 +52,24 @@ struct SimpleDeviceMem
int main() int main()
{ {
std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C}; // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1}; // However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> wei_lengths{G, K, C, Y, X};
std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K}; std::array<ck::index_t, 5> wei_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::partial_sum(rbegin(in_lengths),
std::prev(rend(in_lengths)),
std::next(rbegin(in_strides)),
std::multiplies<>{});
std::partial_sum(rbegin(wei_lengths),
std::prev(rend(wei_lengths)),
std::next(rbegin(wei_strides)),
std::multiplies<>{});
std::partial_sum(rbegin(out_lengths),
std::prev(rend(out_lengths)),
std::next(rbegin(out_strides)),
std::multiplies<>{});
// transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
std::rotate(
rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
std::rotate(
rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
std::rotate(
rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
std::rotate(
rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
std::rotate(
rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
std::rotate(
rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1}; std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1}; std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1}; std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1}; std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
InLayout, InLayout,
...@@ -155,9 +129,9 @@ int main() ...@@ -155,9 +129,9 @@ int main()
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X; std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C + std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
sizeof(WeiDataType) * G * K * Y * X * C + sizeof(WeiDataType) * G * K * Y * X * C +
sizeof(OutDataType) * G * N * Ho * Wo * K; sizeof(OutDataType) * N * Ho * Wo * G * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time; float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
......
add_executable(client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp)
target_link_libraries(client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_operations)
add_executable(client_conv2d_fwd_bias_relu_perchannel_quantization conv2d_fwd_bias_relu_perchannel_quantization.cpp) add_executable(client_conv2d_fwd_bias_relu_perchannel_quantization conv2d_fwd_bias_relu_perchannel_quantization.cpp)
target_link_libraries(client_conv2d_fwd_bias_relu_perchannel_quantization PRIVATE composable_kernel::device_operations) target_link_libraries(client_conv2d_fwd_bias_relu_perchannel_quantization PRIVATE composable_kernel::device_operations)
add_executable(client_conv2d_fwd_bias_tanh_perlayer_quantization conv2d_fwd_bias_tanh_perlayer_quantization.cpp)
target_link_libraries(client_conv2d_fwd_bias_tanh_perlayer_quantization PRIVATE composable_kernel::device_operations)
add_executable(client_conv2d_fwd_bias_relu_perlayer_quantization conv2d_fwd_bias_relu_perlayer_quantization.cpp) add_executable(client_conv2d_fwd_bias_relu_perlayer_quantization conv2d_fwd_bias_relu_perlayer_quantization.cpp)
target_link_libraries(client_conv2d_fwd_bias_relu_perlayer_quantization PRIVATE composable_kernel::device_operations) target_link_libraries(client_conv2d_fwd_bias_relu_perlayer_quantization PRIVATE composable_kernel::device_operations)
...@@ -9,3 +15,6 @@ target_link_libraries(client_conv2d_fwd_perchannel_quantization PRIVATE composab ...@@ -9,3 +15,6 @@ target_link_libraries(client_conv2d_fwd_perchannel_quantization PRIVATE composab
add_executable(client_conv2d_fwd_perlayer_quantization conv2d_fwd_perlayer_quantization.cpp) add_executable(client_conv2d_fwd_perlayer_quantization conv2d_fwd_perlayer_quantization.cpp)
target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable_kernel::device_operations) target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable_kernel::device_operations)
add_executable(client_gemm_quantization gemm_quantization.cpp)
target_link_libraries(client_gemm_quantization PRIVATE composable_kernel::device_operations)
...@@ -17,27 +17,26 @@ using BiasDataType = int32_t; ...@@ -17,27 +17,26 @@ using BiasDataType = int32_t;
using RequantScaleDataType = float; using RequantScaleDataType = float;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K; using BiasLayout = ck::tensor_layout::convolution::G_K;
using RequantScaleLayout = ck::tensor_layout::convolution::G_K; using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::Relu; using ActivationOp = ck::tensor_operation::element_wise::Relu;
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 32; static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; static constexpr ck::index_t Hi = 71; // input H
static constexpr ck::index_t Wi = 71; static constexpr ck::index_t Wi = 71; // input W
static constexpr ck::index_t Ho = 36; static constexpr ck::index_t Ho = 36; // output H
static constexpr ck::index_t Wo = 36; static constexpr ck::index_t Wo = 36; // output W
struct SimpleDeviceMem struct SimpleDeviceMem
{ {
SimpleDeviceMem() = delete; SimpleDeviceMem() = delete;
...@@ -56,26 +55,30 @@ struct SimpleDeviceMem ...@@ -56,26 +55,30 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0}; std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0}; std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, C, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C}; std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C); SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD< using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
NumDimSpatial, NumDimSpatial,
...@@ -136,10 +139,11 @@ int main(int argc, char* argv[]) ...@@ -136,10 +139,11 @@ int main(int argc, char* argv[])
{ {
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X; std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C + std::size_t num_bytes =
G * sizeof(WeiDataType) * K * Y * X * C + G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
G * sizeof(OutDataType) * N * Ho * Wo * K; G * sizeof(BiasDataType) * K + G * sizeof(RequantScaleDataType) * K +
G * sizeof(OutDataType) * N * Ho * Wo * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time; float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
...@@ -162,11 +166,12 @@ int main(int argc, char* argv[]) ...@@ -162,11 +166,12 @@ int main(int argc, char* argv[])
} }
} }
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance // run the best intance
if(best_op_id != -1)
{ {
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
auto& op_ptr = op_ptrs[best_op_id]; auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl; << std::endl;
...@@ -202,4 +207,4 @@ int main(int argc, char* argv[]) ...@@ -202,4 +207,4 @@ int main(int argc, char* argv[])
} }
return 0; return 0;
} }
\ No newline at end of file
...@@ -16,25 +16,26 @@ using WeiDataType = int8_t; ...@@ -16,25 +16,26 @@ using WeiDataType = int8_t;
using BiasDataType = int32_t; using BiasDataType = int32_t;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K; using BiasLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::Relu; using ActivationOp = ck::tensor_operation::element_wise::Relu;
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 32; static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; static constexpr ck::index_t Hi = 71; // input H
static constexpr ck::index_t Wi = 71; static constexpr ck::index_t Wi = 71; // input W
static constexpr ck::index_t Ho = 36; static constexpr ck::index_t Ho = 36; // output H
static constexpr ck::index_t Wo = 36; static constexpr ck::index_t Wo = 36; // output W
static constexpr float requant_scale = 0.5f; // requantize qAcc to qz
struct SimpleDeviceMem struct SimpleDeviceMem
{ {
...@@ -54,23 +55,27 @@ struct SimpleDeviceMem ...@@ -54,23 +55,27 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0}; std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, C, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C}; std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
...@@ -102,26 +107,27 @@ int main(int argc, char* argv[]) ...@@ -102,26 +107,27 @@ int main(int argc, char* argv[])
for(int i = 0; i < op_ptrs.size(); ++i) for(int i = 0; i < op_ptrs.size(); ++i)
{ {
auto& op_ptr = op_ptrs[i]; auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), auto argument_ptr =
wei.GetDeviceBuffer(), op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
{bias.GetDeviceBuffer()}, wei.GetDeviceBuffer(),
out.GetDeviceBuffer(), {bias.GetDeviceBuffer()},
in_lengths, out.GetDeviceBuffer(),
in_strides, in_lengths,
weight_lengths, in_strides,
weight_strides, weight_lengths,
{bias_lengths}, weight_strides,
{bias_strides}, {bias_lengths},
out_lengths, {bias_strides},
out_strides, out_lengths,
conv_strides, out_strides,
conv_dilations, conv_strides,
in_left_pad, conv_dilations,
in_right_pad, in_left_pad,
PassThrough{}, in_right_pad,
PassThrough{}, PassThrough{},
OutElementOp{0.5f, ActivationOp{}}); PassThrough{},
OutElementOp{requant_scale, ActivationOp{}});
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString(); std::string op_name = op_ptr->GetTypeString();
...@@ -130,10 +136,10 @@ int main(int argc, char* argv[]) ...@@ -130,10 +136,10 @@ int main(int argc, char* argv[])
{ {
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X; std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C + std::size_t num_bytes =
G * sizeof(WeiDataType) * K * Y * X * C + G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
G * sizeof(OutDataType) * N * Ho * Wo * K; G * sizeof(BiasDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time; float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
...@@ -156,33 +162,35 @@ int main(int argc, char* argv[]) ...@@ -156,33 +162,35 @@ int main(int argc, char* argv[])
} }
} }
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance // run the best intance
if(best_op_id != -1)
{ {
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
auto& op_ptr = op_ptrs[best_op_id]; auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl; << std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), auto argument_ptr =
wei.GetDeviceBuffer(), op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
{bias.GetDeviceBuffer()}, wei.GetDeviceBuffer(),
out.GetDeviceBuffer(), {bias.GetDeviceBuffer()},
in_lengths, out.GetDeviceBuffer(),
in_strides, in_lengths,
weight_lengths, in_strides,
weight_strides, weight_lengths,
{bias_lengths}, weight_strides,
{bias_strides}, {bias_lengths},
out_lengths, {bias_strides},
out_strides, out_lengths,
conv_strides, out_strides,
conv_dilations, conv_strides,
in_left_pad, conv_dilations,
in_right_pad, in_left_pad,
PassThrough{}, in_right_pad,
PassThrough{}, PassThrough{},
OutElementOp{0.5f, ActivationOp{}}); PassThrough{},
OutElementOp{requant_scale, ActivationOp{}});
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <iostream>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using InDataType = int8_t;
using WeiDataType = int8_t;
using BiasDataType = int32_t;
using RequantScaleDataType = float;
using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K;
using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::TanH;
using OutElementOp =
ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H
static constexpr ck::index_t Wi = 71; // input W
static constexpr ck::index_t Ho = 36; // output H
static constexpr ck::index_t Wo = 36; // output W
static constexpr float sz_inv = 0.5f; // inverse of scale_z
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
NumDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout, RequantScaleLayout>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<BiasDataType, RequantScaleDataType>,
OutDataType,
PassThrough,
PassThrough,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr =
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
weight_lengths,
weight_strides,
{bias_lengths, requant_scale_lengths},
{bias_strides, requant_scale_strides},
out_lengths,
out_strides,
conv_strides,
conv_dilations,
in_left_pad,
in_right_pad,
PassThrough{},
PassThrough{},
OutElementOp{sz_inv, ActivationOp{}});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
std::size_t num_bytes =
G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
G * sizeof(BiasDataType) * K + G * sizeof(RequantScaleDataType) * K +
G * sizeof(OutDataType) * N * Ho * Wo * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_tflops = tflops;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
// run the best intance
if(best_op_id != -1)
{
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr =
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
weight_lengths,
weight_strides,
{bias_lengths, requant_scale_lengths},
{bias_strides, requant_scale_strides},
out_lengths,
out_strides,
conv_strides,
conv_dilations,
in_left_pad,
in_right_pad,
PassThrough{},
PassThrough{},
OutElementOp{sz_inv, ActivationOp{}});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <iostream>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using InDataType = int8_t;
using WeiDataType = int8_t;
using BiasDataType = int32_t;
using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::TanH;
using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H
static constexpr ck::index_t Wi = 71; // input W
static constexpr ck::index_t Ho = 36; // output H
static constexpr ck::index_t Wo = 36; // output W
static constexpr float sacc = 0.5f; // scale of acc
static constexpr float sz_inv = 0.5f; // inverse of scale_z
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<BiasDataType>,
OutDataType,
PassThrough,
PassThrough,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{bias.GetDeviceBuffer()},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
weight_lengths,
weight_strides,
{bias_lengths},
{bias_strides},
out_lengths,
out_strides,
conv_strides,
conv_dilations,
in_left_pad,
in_right_pad,
PassThrough{},
PassThrough{},
OutElementOp{sacc, sz_inv, ActivationOp{}});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
std::size_t num_bytes =
G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
G * sizeof(BiasDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_tflops = tflops;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
// run the best intance
if(best_op_id != -1)
{
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{bias.GetDeviceBuffer()},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
weight_lengths,
weight_strides,
{bias_lengths},
{bias_strides},
out_lengths,
out_strides,
conv_strides,
conv_dilations,
in_left_pad,
in_right_pad,
PassThrough{},
PassThrough{},
OutElementOp{sacc, sz_inv, ActivationOp{}});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
\ No newline at end of file
...@@ -16,25 +16,25 @@ using WeiDataType = int8_t; ...@@ -16,25 +16,25 @@ using WeiDataType = int8_t;
using RequantScaleDataType = float; using RequantScaleDataType = float;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using RequantScaleLayout = ck::tensor_layout::convolution::G_K; using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = PassThrough; using ActivationOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 32; static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; static constexpr ck::index_t Hi = 71; // input H
static constexpr ck::index_t Wi = 71; static constexpr ck::index_t Wi = 71; // input W
static constexpr ck::index_t Ho = 36; static constexpr ck::index_t Ho = 36; // output H
static constexpr ck::index_t Wo = 36; static constexpr ck::index_t Wo = 36; // output W
struct SimpleDeviceMem struct SimpleDeviceMem
{ {
...@@ -54,23 +54,27 @@ struct SimpleDeviceMem ...@@ -54,23 +54,27 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo}; std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0}; std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, C, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C}; std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C); SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
...@@ -130,10 +134,10 @@ int main(int argc, char* argv[]) ...@@ -130,10 +134,10 @@ int main(int argc, char* argv[])
{ {
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X; std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C + std::size_t num_bytes =
G * sizeof(WeiDataType) * K * Y * X * C + G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
G * sizeof(OutDataType) * N * Ho * Wo * K; G * sizeof(RequantScaleDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time; float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
...@@ -156,11 +160,12 @@ int main(int argc, char* argv[]) ...@@ -156,11 +160,12 @@ int main(int argc, char* argv[])
} }
} }
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance // run the best intance
if(best_op_id != -1)
{ {
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
auto& op_ptr = op_ptrs[best_op_id]; auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl; << std::endl;
...@@ -195,4 +200,4 @@ int main(int argc, char* argv[]) ...@@ -195,4 +200,4 @@ int main(int argc, char* argv[])
} }
return 0; return 0;
} }
\ No newline at end of file
...@@ -15,24 +15,25 @@ using InDataType = int8_t; ...@@ -15,24 +15,25 @@ using InDataType = int8_t;
using WeiDataType = int8_t; using WeiDataType = int8_t;
using OutDataType = int8_t; using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::GNHWC; using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = PassThrough; using ActivationOp = PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>; using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 64; static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 32; static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; static constexpr ck::index_t Hi = 71; // input H
static constexpr ck::index_t Wi = 71; static constexpr ck::index_t Wi = 71; // input W
static constexpr ck::index_t Ho = 36; static constexpr ck::index_t Ho = 36; // output H
static constexpr ck::index_t Wo = 36; static constexpr ck::index_t Wo = 36; // output W
static constexpr float requant_scale = 0.5f; // requantize qAcc to qY
struct SimpleDeviceMem struct SimpleDeviceMem
{ {
...@@ -52,20 +53,24 @@ struct SimpleDeviceMem ...@@ -52,20 +53,24 @@ struct SimpleDeviceMem
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi}; std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X}; std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
std::array<ck::index_t, 5> out_lengths{G, N, C, Ho, Wo}; std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C}; std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
std::array<ck::index_t, 2> in_left_pad{1, 1}; std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1}; std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2}; std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1}; std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial, using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
InLayout, InLayout,
...@@ -96,26 +101,27 @@ int main(int argc, char* argv[]) ...@@ -96,26 +101,27 @@ int main(int argc, char* argv[])
for(int i = 0; i < op_ptrs.size(); ++i) for(int i = 0; i < op_ptrs.size(); ++i)
{ {
auto& op_ptr = op_ptrs[i]; auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), auto argument_ptr =
wei.GetDeviceBuffer(), op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
{}, wei.GetDeviceBuffer(),
out.GetDeviceBuffer(), {},
in_lengths, out.GetDeviceBuffer(),
in_strides, in_lengths,
weight_lengths, in_strides,
weight_strides, weight_lengths,
{}, weight_strides,
{}, {},
out_lengths, {},
out_strides, out_lengths,
conv_strides, out_strides,
conv_dilations, conv_strides,
in_left_pad, conv_dilations,
in_right_pad, in_left_pad,
PassThrough{}, in_right_pad,
PassThrough{}, PassThrough{},
OutElementOp{0.5f, ActivationOp{}}); PassThrough{},
OutElementOp{requant_scale, ActivationOp{}});
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString(); std::string op_name = op_ptr->GetTypeString();
...@@ -150,33 +156,34 @@ int main(int argc, char* argv[]) ...@@ -150,33 +156,34 @@ int main(int argc, char* argv[])
} }
} }
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops if(best_op_id != -1)
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance
{ {
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
auto& op_ptr = op_ptrs[best_op_id]; auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl; << std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), auto argument_ptr =
wei.GetDeviceBuffer(), op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
{}, wei.GetDeviceBuffer(),
out.GetDeviceBuffer(), {},
in_lengths, out.GetDeviceBuffer(),
in_strides, in_lengths,
weight_lengths, in_strides,
weight_strides, weight_lengths,
{}, weight_strides,
{}, {},
out_lengths, {},
out_strides, out_lengths,
conv_strides, out_strides,
conv_dilations, conv_strides,
in_left_pad, conv_dilations,
in_right_pad, in_left_pad,
PassThrough{}, in_right_pad,
PassThrough{}, PassThrough{},
OutElementOp{0.5f, ActivationOp{}}); PassThrough{},
OutElementOp{requant_scale, ActivationOp{}});
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment