"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "fa9ed456cba46add99c20442e3d27b7dc033bc6e"
Commit 1c54a541 authored by Artur Wojcik's avatar Artur Wojcik
Browse files

Merge branch 'uif2-initial' into uif2-migraphx

parents ca74a0e7 a1153df6
...@@ -388,6 +388,12 @@ add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${ ...@@ -388,6 +388,12 @@ add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${
if (NOT CK_BUILD_JIT_LIB) if (NOT CK_BUILD_JIT_LIB)
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
add_compile_options(-Werror -Weverything)
endif()
#add flags to reduce the size of binaries
add_compile_options(-Oz -flto=thin)
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp") file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")
file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*) file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
...@@ -399,35 +405,27 @@ if (NOT CK_BUILD_JIT_LIB) ...@@ -399,35 +405,27 @@ if (NOT CK_BUILD_JIT_LIB)
file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance) file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
set(add_inst 0) set(add_inst 0)
if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8") if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8")
#message("fp8 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8") if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8")
#message("bf8 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16") if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
#message("fp16 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32") if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
#message("fp32 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64") if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
#message("fp64 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16") if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16")
#message("bf16 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8") if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
#message("int8 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if(NOT "${cmake_instance}" MATCHES "DTYPES") if(NOT "${cmake_instance}" MATCHES "DTYPES")
#message("instance should be built for all types!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if(add_inst EQUAL 1 OR NOT DEFINED DTYPES) if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
......
...@@ -26,25 +26,37 @@ RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ ...@@ -26,25 +26,37 @@ RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
RUN amdgpu-install -y --usecase=rocm --no-dkms RUN amdgpu-install -y --usecase=rocm --no-dkms
## Sccache binary built from source for ROCm
ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
ENV SCCACHE_INSTALL_LOCATION=/usr/local/.cargo/bin
RUN mkdir -p ${SCCACHE_INSTALL_LOCATION} && \
curl ${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \
chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache
ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION}
# Install dependencies # Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
build-essential \ build-essential \
ccache \
cmake \ cmake \
ccache \
git \ git \
hip-rocclr \ hip-rocclr \
iputils-ping \
jq \ jq \
libelf-dev \ libelf-dev \
libncurses5-dev \ libncurses5-dev \
libnuma-dev \ libnuma-dev \
libpthread-stubs0-dev \ libpthread-stubs0-dev \
llvm-amdgpu \ llvm-amdgpu \
net-tools \
pkg-config \ pkg-config \
python \ python \
python3 \ python3 \
python3-dev \ python3-dev \
python3-pip \ python3-pip \
redis \
sshpass \ sshpass \
stunnel \
software-properties-common \ software-properties-common \
vim \ vim \
nano \ nano \
...@@ -62,7 +74,7 @@ RUN gunzip /usr/local/bin/ninja.gz ...@@ -62,7 +74,7 @@ RUN gunzip /usr/local/bin/ninja.gz
RUN chmod a+x /usr/local/bin/ninja RUN chmod a+x /usr/local/bin/ninja
RUN git clone https://github.com/nico/ninjatracing.git RUN git clone https://github.com/nico/ninjatracing.git
# Update the cmake to the latest version # Update the cmake to the latest version
RUN pip install --upgrade cmake RUN pip install --upgrade cmake==3.27.5
# Setup ubsan environment to printstacktrace # Setup ubsan environment to printstacktrace
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
...@@ -77,9 +89,9 @@ ARG PREFIX=/opt/rocm ...@@ -77,9 +89,9 @@ ARG PREFIX=/opt/rocm
RUN pip3 install --upgrade pip RUN pip3 install --upgrade pip
RUN pip3 install sqlalchemy==1.4.46 RUN pip3 install sqlalchemy==1.4.46
RUN pip3 install pymysql RUN pip3 install pymysql
RUN pip3 install pandas RUN pip3 install pandas==2.0.3
RUN pip3 install setuptools-rust RUN pip3 install setuptools-rust
RUN pip3 install sshtunnel RUN pip3 install sshtunnel==0.4.0
# Setup ubsan environment to printstacktrace # Setup ubsan environment to printstacktrace
ENV UBSAN_OPTIONS=print_stacktrace=1 ENV UBSAN_OPTIONS=print_stacktrace=1
...@@ -115,6 +127,8 @@ RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; ...@@ -115,6 +127,8 @@ RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ];
else echo "using the release compiler"; \ else echo "using the release compiler"; \
fi fi
#clean-up the deb package
RUN sh -c "rm -rf amdgpu-install*"
#ENV HIP_CLANG_PATH='/llvm-project/build/bin' #ENV HIP_CLANG_PATH='/llvm-project/build/bin'
#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'" #RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
...@@ -65,10 +65,10 @@ def getDockerImageName(){ ...@@ -65,10 +65,10 @@ def getDockerImageName(){
} }
def check_host() { def check_host() {
if ("${env.CK_CCACHE}" != "null"){ if ("${env.CK_SCCACHE}" != "null"){
def CCACHE_SERVER="${env.CK_CCACHE.split(':')[0]}" def SCCACHE_SERVER="${env.CK_SCCACHE.split(':')[0]}"
echo "ccache server: ${CCACHE_SERVER}" echo "sccache server: ${SCCACHE_SERVER}"
sh '''ping -c 1 -p 6379 "${CCACHE_SERVER}" | echo $? > tmp.txt''' sh '''ping -c 1 -p 6379 "${SCCACHE_SERVER}" | echo $? > tmp.txt'''
def output = readFile(file: "tmp.txt") def output = readFile(file: "tmp.txt")
echo "tmp.txt contents: \$output" echo "tmp.txt contents: \$output"
return (output != "0") return (output != "0")
...@@ -96,24 +96,9 @@ def build_compiler(){ ...@@ -96,24 +96,9 @@ def build_compiler(){
def getDockerImage(Map conf=[:]){ def getDockerImage(Map conf=[:]){
env.DOCKER_BUILDKIT=1 env.DOCKER_BUILDKIT=1
def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm def prefixpath = conf.get("prefixpath", "/opt/rocm")
def no_cache = conf.get("no_cache", false) def no_cache = conf.get("no_cache", false)
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
echo "ccache server: ${env.CK_CCACHE}"
if(env.CK_CCACHE)
{
if(check_host())
{
echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
}
else
{
echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
}
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
env.CCACHE_DIR = """/tmp/ccache_store"""
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
}
if(no_cache) if(no_cache)
{ {
dockerArgs = dockerArgs + " --no-cache " dockerArgs = dockerArgs + " --no-cache "
...@@ -142,21 +127,6 @@ def buildDocker(install_prefix){ ...@@ -142,21 +127,6 @@ def buildDocker(install_prefix){
def image_name = getDockerImageName() def image_name = getDockerImageName()
echo "Building Docker for ${image_name}" echo "Building Docker for ${image_name}"
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
echo "ccache server: ${env.CK_CCACHE}"
if(env.CK_CCACHE)
{
if(check_host())
{
echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
}
else
{
echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
}
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
env.CCACHE_DIR = """/tmp/ccache_store"""
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
}
echo "Build Args: ${dockerArgs}" echo "Build Args: ${dockerArgs}"
try{ try{
...@@ -219,13 +189,9 @@ def cmake_build(Map conf=[:]){ ...@@ -219,13 +189,9 @@ def cmake_build(Map conf=[:]){
}else{ }else{
setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args
} }
if(env.CK_CCACHE)
{
setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER='ccache' -DCMAKE_C_COMPILER_LAUNCHER='ccache' " + setup_args
}
echo "ccache server: ${env.CK_CCACHE}"
def pre_setup_cmd = """ def pre_setup_cmd = """
#!/bin/bash
echo \$HSA_ENABLE_SDMA echo \$HSA_ENABLE_SDMA
ulimit -c unlimited ulimit -c unlimited
rm -rf build rm -rf build
...@@ -234,6 +200,46 @@ def cmake_build(Map conf=[:]){ ...@@ -234,6 +200,46 @@ def cmake_build(Map conf=[:]){
mkdir install mkdir install
cd build cd build
""" """
def invocation_tag=""
if (setup_args.contains("gfx11")){
invocation_tag="gfx11"
}
if (setup_args.contains("gfx10")){
invocation_tag="gfx10"
}
if (setup_args.contains("gfx90")){
invocation_tag="gfx90"
}
if (setup_args.contains("gfx94")){
invocation_tag="gfx94"
}
if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
pre_setup_cmd = pre_setup_cmd + """
#!/bin/bash
export ROCM_PATH=/opt/rocm
export SCCACHE_ENABLED=true
export SCCACHE_LOG_LEVEL=debug
export SCCACHE_IDLE_TIMEOUT=14400
export COMPILERS_HASH_DIR=/tmp/.sccache
export SCCACHE_BIN=/usr/local/.cargo/bin/sccache
export SCCACHE_EXTRAFILES=/tmp/.sccache/rocm_compilers_hash_file
export SCCACHE_REDIS="redis://${env.CK_SCCACHE}"
echo "connect = ${env.CK_SCCACHE}" >> ../script/redis-cli.conf
export SCCACHE_C_CUSTOM_CACHE_BUSTER="${invocation_tag}"
echo \$SCCACHE_C_CUSTOM_CACHE_BUSTER
stunnel ../script/redis-cli.conf
(
set -e
../script/sccache_wrapper.sh --enforce_redis
)
error_code=\$?
if [ \$error_code -ne 0 ]; then
echo "could not connect to the redis server. using sccache locally."
../script/sccache_wrapper.sh
fi
"""
setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache " + setup_args
}
def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ") def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
// reduce parallelism when compiling, clang uses too much memory // reduce parallelism when compiling, clang uses too much memory
def nt = nthreads() def nt = nthreads()
...@@ -251,7 +257,7 @@ def cmake_build(Map conf=[:]){ ...@@ -251,7 +257,7 @@ def cmake_build(Map conf=[:]){
sh cmd sh cmd
// Only archive from master or develop // Only archive from master or develop
if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "master")) { if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) {
archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
} }
} }
...@@ -635,7 +641,7 @@ def process_results(Map conf=[:]){ ...@@ -635,7 +641,7 @@ def process_results(Map conf=[:]){
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION= CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=
0 21 * * * % ROCMVERSION=5.7;COMPILER_VERSION=;COMPILER_COMMIT= 0 21 * * * % ROCMVERSION=5.7;COMPILER_VERSION=;COMPILER_COMMIT=
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : "" 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=;USE_SCCACHE=false''' : ""
pipeline { pipeline {
agent none agent none
...@@ -682,7 +688,10 @@ pipeline { ...@@ -682,7 +688,10 @@ pipeline {
name: 'hipTensor_branch', name: 'hipTensor_branch',
defaultValue: 'mainline', defaultValue: 'mainline',
description: 'Specify which branch of hipTensor to use (default: mainline)') description: 'Specify which branch of hipTensor to use (default: mainline)')
booleanParam(
name: "USE_SCCACHE",
defaultValue: true,
description: "Use the sccache for building CK (default: ON)")
} }
environment{ environment{
dbuser = "${dbuser}" dbuser = "${dbuser}"
......
# Composable Kernel # Composable Kernel
## Methodology The Composable Kernel (CK) library provides a programming model for writing performance-critical
kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library
uses general purpose kernel languages, such as HIP C++.
Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++. CK uses two concepts to achieve performance portability and code maintainability:
CK utilizes two concepts to achieve performance portability and code maintainability:
* A tile-based programming model * A tile-based programming model
* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation". * Algorithm complexity reduction for complex machine learning (ML) operators. This uses an innovative
technique called *Tensor Coordinate Transformation*.
![ALT](/docs/data/ck_component.png "CK Components") ![ALT](/docs/data/ck_component.png "CK Components")
## Code Structure The current CK library is structured into four layers:
Current CK library are structured into 4 layers: * Templated Tile Operators
* "Templated Tile Operators" layer * Templated Kernel and Invoker
* "Templated Kernel and Invoker" layer * Instantiated Kernel and Invoker
* "Instantiated Kernel and Invoker" layer * Client API
* "Client API" layer
![ALT](/docs/data/ck_layer.png "CK Layers") ![ALT](/docs/data/ck_layer.png "CK Layers")
## Documentation ## General information
Run the steps below to build documentation locally. To build our documentation locally, use the following code:
``` ``` bash
cd docs cd docs
pip3 install -r sphinx/requirements.txt pip3 install -r sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
``` ```
## Contributors You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page.
page.
The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
## Citation ```note
If you use CK, cite us as follows:
If you use CK, please use following citations: * [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???):
* CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???) This paper will be available on arXiv soon.
* [CITATION.cff](/CITATION.cff) * [CITATION.cff](/CITATION.cff)
```
## License CK is released under the **[MIT license](/LICENSE)**.
CK is released under the MIT license. [License File](/LICENSE) ## Building CK
We recommend building CK inside Docker containers, which include all necessary packages. Pre-built
Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composable_kernel/tags).
# Build CK 1. To build a new Docker image, use the Dockerfile provided with the source code:
## Build docker image ```bash
DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
```
```bash 2. Launch the Docker container:
DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
```
Pre-built dockers are available from this public repo:
https://hub.docker.com/r/rocm/composable_kernel/tags
## Launch docker ```bash
docker run \
-it \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
ck:latest \
/bin/bash
```
```bash 3. Clone CK source code from the GitHub repository and start the build:
docker run \
-it \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
ck:latest \
/bin/bash
```
## Build CK ```bash
git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git && \
cd composable_kernel && \
mkdir build && \
cd build
```
```bash You must set the `GPU_TARGETS` macro to specify the GPU target architecture(s) you want
mkdir build && cd build to run CK on. You can specify single or multiple architectures. If you specify multiple architectures,
use a semicolon between each; for example, `gfx908;gfx90a;gfx940`.
# Need to specify target ID, example below is for gfx908 and gfx90a ```bash
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
..
```
cmake \ If you don't set `GPU_TARGETS` on the cmake command line, CK is built for all GPU targets
-D CMAKE_PREFIX_PATH=/opt/rocm \ supported by the current compiler (this may take a long time).
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
..
```
If GPU_TARGETS is not set on the cmake command line, CK will be built for all targets supported by the 4. Build the entire CK library:
current compiler.
```bash
make -j
```
Additional cmake flags can be used to significantly speed-up the build: 5. Install CK:
INSTANCES_ONLY (by default is OFF) must be set to ON in order to build only the instances and library ```bash
while skipping all tests, examples, and profiler. This is useful for libraries that use CK as a dependency. make -j install
```
DTYPES (by default not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances ## Optional post-install steps
of select data types only. Currently, building of int8 instances is taking a lot of time (the compiler fix is in the works).
DL_KERNELS (by default is OFF) must be set to ON in order to build the gemm_dl and batched_gemm_multi_d_dl * Build examples and tests:
instances. Those instances are only needed for the NAVI2x platforms.
### Build examples and tests ```bash
make -j examples tests
```
```bash * Build and run all examples and tests:
make -j examples tests
make test ```bash
``` make -j check
```
Instructions for running each individual examples are under [example](/example) You can find instructions for running each individual example in [example](/example).
* Build ckProfiler:
## Build ckProfiler ```bash
make -j ckProfiler
```
You can find instructions for running ckProfiler in [profiler](/profiler).
Note the `-j` option for building with multiple threads in parallel. This speeds up the build significantly.
Depending on the number of CPU cores and the amount of RAM on your system, you may want to
limit the number of threads. For example, if you have a 128-core CPU and 64 Gb of RAM.
By default, `-j` launches one thread per CPU core, which can cause the build to run out of memory and
crash. In such cases, you can reduce the number of threads to 32 by using `-j32`.
Additional cmake flags can be used to significantly speed-up the build:
* `INSTANCES_ONLY` (default is OFF) must be set to ON in order to build only the instances and library
while skipping all tests, examples, and profiler. This is useful in cases when you plan to use CK as a
dependency and don't plan to run any examples or tests.
* `DTYPES` (default is not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build
instances of select data types only. The main default data types are fp32 and fp16; you can safely skip
other data types.
* `DL_KERNELS` (default is OFF) must be set to ON in order to build instances, such as `gemm_dl` or
`batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
other platforms have faster instances, such as `xdl` or `wmma`, available.
## Using sccache for building
The default CK Docker images come with a pre-installed version of sccache, which supports clang
being used as hip-compiler (" -x hip"). Using sccache can help reduce the time to re-build code from
hours to 1-2 minutes. In order to invoke sccache, you need to run:
```bash ```bash
make -j ckProfiler sccache --start-server
``` ```
Instructions for running ckProfiler are under [profiler](/profiler)
## Install CK then add the following flags to the cmake command line:
```bash ```bash
make install -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache
``` ```
You may need to clean up the build folder and repeat the cmake and make steps in order to take
advantage of the sccache during subsequent builds.
## Using CK as pre-built kernel library ## Using CK as pre-built kernel library
Instructions for using CK as a pre-built kernel library are under [client_example](/client_example) You can find instructions for using CK as a pre-built kernel library in [client_example](/client_example).
## Contributing ## Contributing to CK
When you contribute to Composable Kernel, make sure to run `clang-format` on all the changed files. We highly recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run: When you contribute to CK, make sure you run `clang-format` on all changed files. We highly
recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
```bash ```bash
sudo script/install_precommit.sh sudo script/install_precommit.sh
``` ```
This way, `pre-commit` will add the appropriate hooks to your local repository and automatically run `clang-format` (and possibly additional checks) before any commit is created. With this approach, `pre-commit` adds the appropriate hooks to your local repository and
automatically runs `clang-format` (and possibly additional checks) before any commit is created.
If you need to uninstall hooks from the repository, you can do so by running the following command: If you need to uninstall hooks from the repository, you can do so by running the following command:
...@@ -141,14 +191,5 @@ If you need to uninstall hooks from the repository, you can do so by running the ...@@ -141,14 +191,5 @@ If you need to uninstall hooks from the repository, you can do so by running the
script/uninstall_precommit.sh script/uninstall_precommit.sh
``` ```
If for any reason, you need to temporarily disable precommit hooks, you can add the `--no-verify` option to the `git commit` command. If you need to temporarily disable pre-commit hooks, you can add the `--no-verify` option to the
`git commit` command.
## Caveat
### Kernel Timing and Verification
CK's own kernel timer will warn up kernel once, and then run it multiple times
to get average kernel time. For some kernels that use atomic add, this will cause
output buffer to be accumulated multiple times, causing verification failure.
To work around it, do not use CK's own timer and do verification at the same time.
CK's own timer and verification in each example and ckProfiler can be enabled or
disabled from command line.
...@@ -16,10 +16,10 @@ ...@@ -16,10 +16,10 @@
using InDataType = ck::half_t; using InDataType = ck::half_t;
using OutDataType = ck::half_t; using OutDataType = ck::half_t;
using ImageLayout = ck::tensor_layout::convolution::GNHWC; using ImageLayout = ck::tensor_layout::convolution::NHWGC;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 2;
static constexpr ck::index_t N = 32; // batch size static constexpr ck::index_t N = 32; // batch size
static constexpr ck::index_t C = 32; // input channel (per group) static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
...@@ -52,18 +52,18 @@ int main() ...@@ -52,18 +52,18 @@ int main()
std::array<ck::index_t, 2> wei_spatial_lengths{Y, X}; std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo}; std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};
// We have NHWGC in memory space (G is dummy) // We have NHWGC in memory space
// However, CK's API only accept length and stride with order of GNCHW // However, CK's API only accepts lengths and strides with order of GNCHW.
// Hence, we need to adjust the order of stride // Hence, we need to adjust the order of strides.
std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C}; std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 2> gemm_strides{Y * X * C, 1}; std::array<ck::index_t, 3> gemm_strides{Y * X * C, G * Y * X * C, 1};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1}; std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1}; std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1}; std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1}; std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Ho * Wo * Y * X * C); SimpleDeviceMem in(sizeof(InDataType) * G * N * Ho * Wo * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Hi * Wi * G * C); SimpleDeviceMem out(sizeof(OutDataType) * N * Hi * Wi * G * C);
using namespace ck::conv_tensor_rearrange_op; using namespace ck::conv_tensor_rearrange_op;
...@@ -93,6 +93,7 @@ int main() ...@@ -93,6 +93,7 @@ int main()
auto& op_ptr = op_ptrs[i]; auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(), out.GetDeviceBuffer(),
G,
N, N,
C, C,
in_spatial_lengths, in_spatial_lengths,
...@@ -112,7 +113,7 @@ int main() ...@@ -112,7 +113,7 @@ int main()
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C + std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
sizeof(OutDataType) * N * Ho * Wo * Y * X * C; sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C;
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
...@@ -149,6 +150,7 @@ int main() ...@@ -149,6 +150,7 @@ int main()
<< std::endl; << std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(), out.GetDeviceBuffer(),
G,
N, N,
C, C,
in_spatial_lengths, in_spatial_lengths,
......
...@@ -16,10 +16,10 @@ ...@@ -16,10 +16,10 @@
using InDataType = ck::half_t; using InDataType = ck::half_t;
using OutDataType = ck::half_t; using OutDataType = ck::half_t;
using ImageLayout = ck::tensor_layout::convolution::GNHWC; using ImageLayout = ck::tensor_layout::convolution::NHWGC;
static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1; static constexpr ck::index_t G = 2;
static constexpr ck::index_t N = 32; // batch size static constexpr ck::index_t N = 32; // batch size
static constexpr ck::index_t C = 32; // input channel (per group) static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H static constexpr ck::index_t Y = 3; // filter H
...@@ -52,11 +52,11 @@ int main() ...@@ -52,11 +52,11 @@ int main()
std::array<ck::index_t, 2> wei_spatial_lengths{Y, X}; std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo}; std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};
// We have NHWGC in memory space (G is dummy) // We have NHWGC in memory space
// However, CK's API only accept length and stride with order of GNCHW // However, CK's API only accepts lengths and strides with order of GNCHW.
// Hence, we need to adjust the order of stride // Hence, we need to adjust the order of strides.
std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C}; std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 2> gemm_strides{Y * X * C, 1}; std::array<ck::index_t, 3> gemm_strides{Y * X * C, G * Y * X * C, 1};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1}; std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1}; std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
...@@ -64,7 +64,7 @@ int main() ...@@ -64,7 +64,7 @@ int main()
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1}; std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C); SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * Y * X * C); SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C);
using namespace ck::conv_tensor_rearrange_op; using namespace ck::conv_tensor_rearrange_op;
...@@ -93,6 +93,7 @@ int main() ...@@ -93,6 +93,7 @@ int main()
auto& op_ptr = op_ptrs[i]; auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(), out.GetDeviceBuffer(),
G,
N, N,
C, C,
in_spatial_lengths, in_spatial_lengths,
...@@ -112,7 +113,7 @@ int main() ...@@ -112,7 +113,7 @@ int main()
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C + std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
sizeof(OutDataType) * N * Ho * Wo * Y * X * C; sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C;
float gb_per_sec = num_bytes / 1.E6 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time;
...@@ -149,6 +150,7 @@ int main() ...@@ -149,6 +150,7 @@ int main()
<< std::endl; << std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(), out.GetDeviceBuffer(),
G,
N, N,
C, C,
in_spatial_lengths, in_spatial_lengths,
......
...@@ -20,7 +20,7 @@ using DeviceColToImgInstance = ck::tensor_operation::device::DeviceColumnToImage ...@@ -20,7 +20,7 @@ using DeviceColToImgInstance = ck::tensor_operation::device::DeviceColumnToImage
bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params) bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params)
{ {
const auto G = conv_params.G_;
const auto N = conv_params.N_; const auto N = conv_params.N_;
const auto C = conv_params.C_; const auto C = conv_params.C_;
...@@ -31,7 +31,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv ...@@ -31,7 +31,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
C * ck::accumulate_n<ck::index_t>( C * ck::accumulate_n<ck::index_t>(
conv_params.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); conv_params.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
const auto in_desc = HostTensorDescriptor({NDoHoWo, CZYX}); const auto in_desc = HostTensorDescriptor({G, NDoHoWo, CZYX});
const auto out_desc = const auto out_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params); ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params);
...@@ -39,7 +39,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv ...@@ -39,7 +39,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{}; std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{}; std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{}; std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{};
std::array<ck::index_t, 2> gemm_m_k_strides{}; std::array<ck::index_t, 3> gemm_g_m_k_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{}; std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{}; std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{}; std::array<ck::index_t, NDimSpatial> input_left_pads{};
...@@ -50,7 +50,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv ...@@ -50,7 +50,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
copy(conv_params.input_spatial_lengths_, input_spatial_lengths); copy(conv_params.input_spatial_lengths_, input_spatial_lengths);
copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths); copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths);
copy(conv_params.output_spatial_lengths_, output_spatial_lengths); copy(conv_params.output_spatial_lengths_, output_spatial_lengths);
copy(in_desc.GetStrides(), gemm_m_k_strides); copy(in_desc.GetStrides(), gemm_g_m_k_strides);
copy(out_desc.GetStrides(), image_g_n_c_wis_strides); copy(out_desc.GetStrides(), image_g_n_c_wis_strides);
copy(conv_params.conv_filter_strides_, conv_filter_strides); copy(conv_params.conv_filter_strides_, conv_filter_strides);
copy(conv_params.conv_filter_dilations_, conv_filter_dilations); copy(conv_params.conv_filter_dilations_, conv_filter_dilations);
...@@ -86,13 +86,14 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv ...@@ -86,13 +86,14 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
auto invoker = col2img.MakeInvoker(); auto invoker = col2img.MakeInvoker();
auto argument = col2img.MakeArgument(in_device_buf.GetDeviceBuffer(), auto argument = col2img.MakeArgument(in_device_buf.GetDeviceBuffer(),
out_device_buf.GetDeviceBuffer(), out_device_buf.GetDeviceBuffer(),
G,
N, N,
C, C,
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
image_g_n_c_wis_strides, image_g_n_c_wis_strides,
gemm_m_k_strides, gemm_g_m_k_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
...@@ -108,7 +109,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv ...@@ -108,7 +109,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
} }
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t num_btype = NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType)); std::size_t num_btype = G * NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType));
float gb_per_sec = num_btype / 1.E6 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl; std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
......
...@@ -20,7 +20,7 @@ using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumn ...@@ -20,7 +20,7 @@ using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumn
bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params) bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params)
{ {
const auto G = conv_params.G_;
const auto N = conv_params.N_; const auto N = conv_params.N_;
const auto C = conv_params.C_; const auto C = conv_params.C_;
...@@ -33,13 +33,13 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv ...@@ -33,13 +33,13 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
const auto in_desc = const auto in_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params); ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params);
const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX}); const auto out_desc = HostTensorDescriptor({G, NDoHoWo, CZYX});
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{}; std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{}; std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{}; std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{}; std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{};
std::array<ck::index_t, 2> gemm_m_k_strides{}; std::array<ck::index_t, 3> gemm_g_m_k_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{}; std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{}; std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{}; std::array<ck::index_t, NDimSpatial> input_left_pads{};
...@@ -51,7 +51,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv ...@@ -51,7 +51,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths); copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths);
copy(conv_params.output_spatial_lengths_, output_spatial_lengths); copy(conv_params.output_spatial_lengths_, output_spatial_lengths);
copy(in_desc.GetStrides(), image_g_n_c_wis_strides); copy(in_desc.GetStrides(), image_g_n_c_wis_strides);
copy(out_desc.GetStrides(), gemm_m_k_strides); copy(out_desc.GetStrides(), gemm_g_m_k_strides);
copy(conv_params.conv_filter_strides_, conv_filter_strides); copy(conv_params.conv_filter_strides_, conv_filter_strides);
copy(conv_params.conv_filter_dilations_, conv_filter_dilations); copy(conv_params.conv_filter_dilations_, conv_filter_dilations);
copy(conv_params.input_left_pads_, input_left_pads); copy(conv_params.input_left_pads_, input_left_pads);
...@@ -86,13 +86,14 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv ...@@ -86,13 +86,14 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
auto invoker = img2col.MakeInvoker(); auto invoker = img2col.MakeInvoker();
auto argument = img2col.MakeArgument(in_device_buf.GetDeviceBuffer(), auto argument = img2col.MakeArgument(in_device_buf.GetDeviceBuffer(),
out_device_buf.GetDeviceBuffer(), out_device_buf.GetDeviceBuffer(),
G,
N, N,
C, C,
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
image_g_n_c_wis_strides, image_g_n_c_wis_strides,
gemm_m_k_strides, gemm_g_m_k_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
...@@ -108,7 +109,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv ...@@ -108,7 +109,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
} }
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t num_btype = NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType)); std::size_t num_btype = G * NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType));
float gb_per_sec = num_btype / 1.E6 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl; std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
......
...@@ -14,11 +14,12 @@ namespace device { ...@@ -14,11 +14,12 @@ namespace device {
/** /**
* \brief Convolution Tensor Rearrange. * \brief Convolution Tensor Rearrange.
* *
* This Device operator supports conversion image ([G, N, Di, Hi, Wi, C]) to * This Device operator supports converting an image to
* the gemm problem([N * Do * Ho * Wo, Z * Y * X * C]) (Image to Column) and * the GEMM representation (Image to Column) and
* conversion gemm form to the image (Column to Image). * converting a GEMM form to the image (Column to Image).
* * Supported layouts:
* Note that G must be equal to 1. * [G, N, Di, Hi, Wi, C] <-> [G, N * Do * Ho * Wo, Z * Y * X * C]
* [N, Di, Hi, Wi, G, C] <-> [N * Do * Ho * Wo, G, Z * Y * X * C]
* *
* \tparam NDimSpatial Number of spatial dimensions. * \tparam NDimSpatial Number of spatial dimensions.
* \tparam ImageLayout Input Layout. * \tparam ImageLayout Input Layout.
...@@ -39,13 +40,14 @@ struct DeviceConvTensorRearrange : public BaseOperator ...@@ -39,13 +40,14 @@ struct DeviceConvTensorRearrange : public BaseOperator
* *
* \param p_in A pointer to the device memory of the input image. * \param p_in A pointer to the device memory of the input image.
* \param p_out A pointer to the device memory of the output. * \param p_out A pointer to the device memory of the output.
* \param G Convolution number of groups.
* \param N Convolution batch size. * \param N Convolution batch size.
* \param C Convolution number of channels. * \param C Convolution number of channels.
* \param input_spatial_lengths Input spatial lengths. * \param input_spatial_lengths Input spatial lengths.
* \param filter_spatial_lengths Filter spatial lengths. * \param filter_spatial_lengths Filter spatial lengths.
* \param output_spatial_lengths Output spatial lengths. * \param output_spatial_lengths Output spatial lengths.
* \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W]. * \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W].
* \param gemm_m_k_strides Gemm form strides. * \param gemm_g_m_k_strides Gemm form strides.
* \param conv_filter_strides Convolution filter strides. * \param conv_filter_strides Convolution filter strides.
* \param conv_filter_dilations Convolution filter dilations. * \param conv_filter_dilations Convolution filter dilations.
* \param input_left_pads Convolution left pads. * \param input_left_pads Convolution left pads.
...@@ -55,13 +57,14 @@ struct DeviceConvTensorRearrange : public BaseOperator ...@@ -55,13 +57,14 @@ struct DeviceConvTensorRearrange : public BaseOperator
virtual std::unique_ptr<BaseArgument> virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in, MakeArgumentPointer(const void* p_in,
void* p_out, void* p_out,
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
......
...@@ -17,15 +17,18 @@ ...@@ -17,15 +17,18 @@
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp" #include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
#include "ck/host_utility/io.hpp" #include "ck/host_utility/io.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
namespace device { namespace device {
// Image to column for input layout NDHWC: // Column to Image:
// input : image converted to the gemm problem [N * Do * Ho * Wo, Z * Y * X * C] // input : gemm form [G, N * Do * Ho * Wo, Z * Y * X * C]
// output : image [N, Di, Hi, Wi, C] // output : input image [G, N, Di, Hi, Wi, C]
// input : gemm form [N * Do * Ho * Wo, G, Z * Y * X * C]
// output : input image [N, Di, Hi, Wi, G, C]
template <index_t NDimSpatial, template <index_t NDimSpatial,
typename ImageLayout, typename ImageLayout,
typename InputDataType, typename InputDataType,
...@@ -43,6 +46,14 @@ struct DeviceColumnToImageImpl ...@@ -43,6 +46,14 @@ struct DeviceColumnToImageImpl
OutputDataType, OutputDataType,
conv_tensor_rearrange_op::ColumnToImage> conv_tensor_rearrange_op::ColumnToImage>
{ {
static constexpr bool is_NSpatialGC =
std::is_same_v<ImageLayout, tensor_layout::convolution::NWGC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::NHWGC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::NDHWGC>;
static constexpr bool is_GNSpatialC =
std::is_same_v<ImageLayout, tensor_layout::convolution::GNWC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNHWC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNDHWC>;
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{}; static constexpr auto I1 = Number<1>{};
...@@ -90,7 +101,7 @@ struct DeviceColumnToImageImpl ...@@ -90,7 +101,7 @@ struct DeviceColumnToImageImpl
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& independent_filters, const std::array<index_t, NDimSpatial>& independent_filters,
const std::array<index_t, NDimSpatial>& effs) const std::array<index_t, NDimSpatial>& effs)
{ {
...@@ -100,23 +111,23 @@ struct DeviceColumnToImageImpl ...@@ -100,23 +111,23 @@ struct DeviceColumnToImageImpl
C * ck::accumulate_n<index_t>( C * ck::accumulate_n<index_t>(
filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>()); filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
const index_t NStride = DoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1]; const index_t NStride = DoHoWo * gemm_g_m_k_strides[I1] * gemm_g_m_k_strides[I2];
// Calculate the appropriate stride for each set of independent filters // Calculate the appropriate stride for each set of independent filters
// in each dimension // in each dimension
const index_t WStride = const index_t WStride = math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) *
math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) * gemm_m_k_strides[I0]; gemm_g_m_k_strides[I1];
const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) * const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) *
output_spatial_lengths[XIdx] * gemm_m_k_strides[I0]; output_spatial_lengths[XIdx] * gemm_g_m_k_strides[I1];
const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) * const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) *
output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] * output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] *
gemm_m_k_strides[I0]; gemm_g_m_k_strides[I1];
// Create descriptor for independent filters in each dimension and // Create descriptor for independent filters in each dimension and
// then merge them into column form // then merge them into column form
if constexpr(NDimSpatial == 1) if constexpr(NDimSpatial == 1)
{ {
const auto desc_gemm_form = const auto desc_gemm_form =
make_naive_tensor_descriptor(make_tuple(N, independent_filters[XIdx], CZYX), make_naive_tensor_descriptor(make_tuple(N, independent_filters[XIdx], CZYX),
make_tuple(NStride, WStride, gemm_m_k_strides[I1])); make_tuple(NStride, WStride, gemm_g_m_k_strides[I2]));
const auto desc_gemm_form_merged_filters = transform_tensor_descriptor( const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
desc_gemm_form, desc_gemm_form,
make_tuple(make_merge_transform(make_tuple(N, independent_filters[XIdx])), make_tuple(make_merge_transform(make_tuple(N, independent_filters[XIdx])),
...@@ -130,7 +141,7 @@ struct DeviceColumnToImageImpl ...@@ -130,7 +141,7 @@ struct DeviceColumnToImageImpl
{ {
const auto desc_gemm_form = make_naive_tensor_descriptor( const auto desc_gemm_form = make_naive_tensor_descriptor(
make_tuple(N, independent_filters[YIdx], independent_filters[XIdx], CZYX), make_tuple(N, independent_filters[YIdx], independent_filters[XIdx], CZYX),
make_tuple(NStride, HStride, WStride, gemm_m_k_strides[I1])); make_tuple(NStride, HStride, WStride, gemm_g_m_k_strides[I2]));
const auto desc_gemm_form_merged_filters = transform_tensor_descriptor( const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
desc_gemm_form, desc_gemm_form,
make_tuple(make_merge_transform( make_tuple(make_merge_transform(
...@@ -149,7 +160,7 @@ struct DeviceColumnToImageImpl ...@@ -149,7 +160,7 @@ struct DeviceColumnToImageImpl
independent_filters[YIdx], independent_filters[YIdx],
independent_filters[XIdx], independent_filters[XIdx],
CZYX), CZYX),
make_tuple(NStride, DStride, HStride, WStride, gemm_m_k_strides[I1])); make_tuple(NStride, DStride, HStride, WStride, gemm_g_m_k_strides[I2]));
const auto desc_gemm_form_merged_filters = transform_tensor_descriptor( const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
desc_gemm_form, desc_gemm_form,
make_tuple(make_merge_transform(make_tuple(N, make_tuple(make_merge_transform(make_tuple(N,
...@@ -252,34 +263,38 @@ struct DeviceColumnToImageImpl ...@@ -252,34 +263,38 @@ struct DeviceColumnToImageImpl
decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>( decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
InputGridDesc{}))>; InputGridDesc{}))>;
using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc, using GridwiseTensorRearrangeKernel =
InputDataType, GridwiseTensorRearrange<InputGridDesc,
OutputGridDesc, InputDataType,
OutputDataType, OutputGridDesc,
BlockSize, OutputDataType,
MPerBlock, BlockSize,
KPerBlock, MPerBlock,
ThreadClusterLengths, KPerBlock,
ScalarPerVector, ThreadClusterLengths,
InMemoryDataOperationEnum::Add, ScalarPerVector,
Block2ETileMap>; InMemoryDataOperationEnum::Add,
Block2ETileMap,
ComputePtrOffsetOfStridedBatch<I0>>;
struct Argument : public BaseArgument struct Argument : public BaseArgument
{ {
Argument(const void* p_in, // input image Argument(const void* p_in, // input image
void* p_out, // output image void* p_out, // output image
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads) const std::array<index_t, NDimSpatial>& input_right_pads)
: C_(C), : G_(G),
C_(C),
X_(filter_spatial_lengths[NDimSpatial - I1]), X_(filter_spatial_lengths[NDimSpatial - I1]),
p_in_{static_cast<const InputDataType*>(p_in)}, p_in_{static_cast<const InputDataType*>(p_in)},
p_out_{static_cast<OutputDataType*>(p_out)}, p_out_{static_cast<OutputDataType*>(p_out)},
...@@ -289,6 +304,9 @@ struct DeviceColumnToImageImpl ...@@ -289,6 +304,9 @@ struct DeviceColumnToImageImpl
input_left_pads_{input_left_pads}, input_left_pads_{input_left_pads},
input_right_pads_{input_right_pads} input_right_pads_{input_right_pads}
{ {
compute_ptr_offset_of_batch_.BatchStrideA_ = gemm_g_m_k_strides[I0];
compute_ptr_offset_of_batch_.BatchStrideC_ = image_g_n_c_wis_strides[I0];
const index_t x_eff = const index_t x_eff =
(filter_spatial_lengths[XIdx] - 1) * conv_filter_dilations[XIdx] + 1; (filter_spatial_lengths[XIdx] - 1) * conv_filter_dilations[XIdx] + 1;
const index_t y_eff = const index_t y_eff =
...@@ -354,7 +372,7 @@ struct DeviceColumnToImageImpl ...@@ -354,7 +372,7 @@ struct DeviceColumnToImageImpl
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
conv_filter_strides, conv_filter_strides,
gemm_m_k_strides, gemm_g_m_k_strides,
independent_filters, independent_filters,
effs); effs);
const auto out_grid_desc_m_k = const auto out_grid_desc_m_k =
...@@ -387,10 +405,9 @@ struct DeviceColumnToImageImpl ...@@ -387,10 +405,9 @@ struct DeviceColumnToImageImpl
// Memory offsets to next set of independent filters, // Memory offsets to next set of independent filters,
// move to independent filters in each dimension // move to independent filters in each dimension
const index_t in_offset = const index_t in_offset =
x_idx * gemm_m_k_strides[0] + (x_idx + y_idx * output_spatial_lengths[XIdx] +
y_idx * gemm_m_k_strides[0] * output_spatial_lengths[XIdx] + z_idx * output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx]) *
z_idx * gemm_m_k_strides[0] * output_spatial_lengths[YIdx] * gemm_g_m_k_strides[I1];
output_spatial_lengths[XIdx];
// Move to independent filters in appropriate dimensions // Move to independent filters in appropriate dimensions
const index_t out_offset = const index_t out_offset =
x_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + XIdx] + x_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + XIdx] +
...@@ -417,6 +434,7 @@ struct DeviceColumnToImageImpl ...@@ -417,6 +434,7 @@ struct DeviceColumnToImageImpl
} }
} }
const ck::index_t G_;
const ck::index_t C_; const ck::index_t C_;
const ck::index_t X_; const ck::index_t X_;
...@@ -434,6 +452,8 @@ struct DeviceColumnToImageImpl ...@@ -434,6 +452,8 @@ struct DeviceColumnToImageImpl
std::vector<const InputDataType*> p_in_container_; std::vector<const InputDataType*> p_in_container_;
std::vector<OutputDataType*> p_out_container_; std::vector<OutputDataType*> p_out_container_;
ComputePtrOffsetOfStridedBatch<I0> compute_ptr_offset_of_batch_;
}; };
struct Invoker : public BaseInvoker struct Invoker : public BaseInvoker
...@@ -451,6 +471,7 @@ struct DeviceColumnToImageImpl ...@@ -451,6 +471,7 @@ struct DeviceColumnToImageImpl
OutputGridDesc, OutputGridDesc,
OutputDataType, OutputDataType,
Block2ETileMap, Block2ETileMap,
ComputePtrOffsetOfStridedBatch<I0>,
GridwiseTensorRearrangeKernel>; GridwiseTensorRearrangeKernel>;
// Execute each set of independent filters // Execute each set of independent filters
...@@ -460,7 +481,7 @@ struct DeviceColumnToImageImpl ...@@ -460,7 +481,7 @@ struct DeviceColumnToImageImpl
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>( BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
arg.out_grid_desc_m_k_container_[i]); arg.out_grid_desc_m_k_container_[i]);
const index_t grid_size = const index_t grid_size =
block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]); block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]) * arg.G_;
elapsed_time += launch_and_time_kernel(stream_config, elapsed_time += launch_and_time_kernel(stream_config,
kernel, kernel,
dim3(grid_size), dim3(grid_size),
...@@ -470,7 +491,9 @@ struct DeviceColumnToImageImpl ...@@ -470,7 +491,9 @@ struct DeviceColumnToImageImpl
arg.p_in_container_[i], arg.p_in_container_[i],
arg.out_grid_desc_m_k_container_[i], arg.out_grid_desc_m_k_container_[i],
arg.p_out_container_[i], arg.p_out_container_[i],
block_2_tile_map); arg.G_,
block_2_tile_map,
arg.compute_ptr_offset_of_batch_);
} }
return elapsed_time; return elapsed_time;
} }
...@@ -485,8 +508,7 @@ struct DeviceColumnToImageImpl ...@@ -485,8 +508,7 @@ struct DeviceColumnToImageImpl
bool IsSupportedArgument(const Argument& arg) bool IsSupportedArgument(const Argument& arg)
{ {
using namespace tensor_layout::convolution; using namespace tensor_layout::convolution;
if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> || if constexpr(!(is_NSpatialGC || is_GNSpatialC))
std::is_same_v<ImageLayout, GNDHWC>))
{ {
return false; return false;
} }
...@@ -534,13 +556,14 @@ struct DeviceColumnToImageImpl ...@@ -534,13 +556,14 @@ struct DeviceColumnToImageImpl
static auto MakeArgument(const void* p_in, // input image static auto MakeArgument(const void* p_in, // input image
void* p_out, // output image void* p_out, // output image
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
...@@ -548,13 +571,14 @@ struct DeviceColumnToImageImpl ...@@ -548,13 +571,14 @@ struct DeviceColumnToImageImpl
{ {
return Argument{static_cast<const InputDataType*>(p_in), return Argument{static_cast<const InputDataType*>(p_in),
static_cast<OutputDataType*>(p_out), static_cast<OutputDataType*>(p_out),
G,
N, N,
C, C,
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
image_g_n_c_wis_strides, image_g_n_c_wis_strides,
gemm_m_k_strides, gemm_g_m_k_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
...@@ -566,13 +590,14 @@ struct DeviceColumnToImageImpl ...@@ -566,13 +590,14 @@ struct DeviceColumnToImageImpl
std::unique_ptr<BaseArgument> std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in, // input image MakeArgumentPointer(const void* p_in, // input image
void* p_out, // output image void* p_out, // output image
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
...@@ -580,13 +605,14 @@ struct DeviceColumnToImageImpl ...@@ -580,13 +605,14 @@ struct DeviceColumnToImageImpl
{ {
return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in), return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
static_cast<OutputDataType*>(p_out), static_cast<OutputDataType*>(p_out),
G,
N, N,
C, C,
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
image_g_n_c_wis_strides, image_g_n_c_wis_strides,
gemm_m_k_strides, gemm_g_m_k_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
......
...@@ -15,15 +15,18 @@ ...@@ -15,15 +15,18 @@
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp" #include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
#include "ck/host_utility/io.hpp" #include "ck/host_utility/io.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
namespace device { namespace device {
// Image to column for input layout NDHWC: // Image to column:
// input : input image [N, Di, Hi, Wi, C] // input : input image [G, N, Di, Hi, Wi, C]
// output : gemm form [N * Do * Ho * Wo, Z * Y * X * C] // output : gemm form [G * N * Do * Ho * Wo, Z * Y * X * C]
// input : input image [N, Di, Hi, Wi, G, C]
// output : gemm form [N * Do * Ho * Wo * G, Z * Y * X * C]
template <index_t NDimSpatial, template <index_t NDimSpatial,
typename ImageLayout, typename ImageLayout,
typename InputDataType, typename InputDataType,
...@@ -41,6 +44,14 @@ struct DeviceImageToColumnImpl ...@@ -41,6 +44,14 @@ struct DeviceImageToColumnImpl
OutputDataType, OutputDataType,
conv_tensor_rearrange_op::ImageToColumn> conv_tensor_rearrange_op::ImageToColumn>
{ {
static constexpr bool is_NSpatialGC =
std::is_same_v<ImageLayout, tensor_layout::convolution::NWGC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::NHWGC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::NDHWGC>;
static constexpr bool is_GNSpatialC =
std::is_same_v<ImageLayout, tensor_layout::convolution::GNWC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNHWC> ||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNDHWC>;
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{}; static constexpr auto I1 = Number<1>{};
...@@ -109,7 +120,7 @@ struct DeviceImageToColumnImpl ...@@ -109,7 +120,7 @@ struct DeviceImageToColumnImpl
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, 2>& gemm_m_k_strides) const std::array<index_t, 3>& gemm_g_m_k_strides)
{ {
const index_t NDoHoWo = const index_t NDoHoWo =
N * ck::accumulate_n<index_t>( N * ck::accumulate_n<index_t>(
...@@ -117,11 +128,10 @@ struct DeviceImageToColumnImpl ...@@ -117,11 +128,10 @@ struct DeviceImageToColumnImpl
const index_t CZYX = const index_t CZYX =
C * ck::accumulate_n<index_t>( C * ck::accumulate_n<index_t>(
filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>()); filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
const auto desc_mraw_kraw = make_naive_tensor_descriptor(
make_tuple(NDoHoWo, CZYX), make_tuple(gemm_m_k_strides[I0], gemm_m_k_strides[I1]));
const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_mraw_kraw); const auto desc_mraw_kraw = make_naive_tensor_descriptor(
return desc_m_k; make_tuple(NDoHoWo, CZYX), make_tuple(gemm_g_m_k_strides[I1], gemm_g_m_k_strides[I2]));
return matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
} }
using InputGridDesc = using InputGridDesc =
...@@ -132,34 +142,38 @@ struct DeviceImageToColumnImpl ...@@ -132,34 +142,38 @@ struct DeviceImageToColumnImpl
decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>( decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
OutputGridDesc{}))>; OutputGridDesc{}))>;
using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc, using GridwiseTensorRearrangeKernel =
InputDataType, GridwiseTensorRearrange<InputGridDesc,
OutputGridDesc, InputDataType,
OutputDataType, OutputGridDesc,
BlockSize, OutputDataType,
MPerBlock, BlockSize,
KPerBlock, MPerBlock,
ThreadClusterLengths, KPerBlock,
ScalarPerVector, ThreadClusterLengths,
InMemoryDataOperationEnum::Set, ScalarPerVector,
Block2ETileMap>; InMemoryDataOperationEnum::Set,
Block2ETileMap,
ComputePtrOffsetOfStridedBatch<I0>>;
struct Argument : public BaseArgument struct Argument : public BaseArgument
{ {
Argument(const void* p_in, // input image Argument(const void* p_in, // input image
void* p_out, // gemm form void* p_out, // gemm form
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads) const std::array<index_t, NDimSpatial>& input_right_pads)
: C_(C), : G_(G),
C_(C),
X_(filter_spatial_lengths[NDimSpatial - I1]), X_(filter_spatial_lengths[NDimSpatial - I1]),
p_in_{static_cast<const InputDataType*>(p_in)}, p_in_{static_cast<const InputDataType*>(p_in)},
p_out_{static_cast<OutputDataType*>(p_out)}, p_out_{static_cast<OutputDataType*>(p_out)},
...@@ -176,14 +190,16 @@ struct DeviceImageToColumnImpl ...@@ -176,14 +190,16 @@ struct DeviceImageToColumnImpl
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
image_g_n_c_wis_strides, image_g_n_c_wis_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
input_right_pads); input_right_pads);
out_grid_desc_m_k_ = MakeOutDescriptor_M_K( out_grid_desc_m_k_ = MakeOutDescriptor_M_K(
N, C, filter_spatial_lengths, output_spatial_lengths, gemm_m_k_strides); N, C, filter_spatial_lengths, output_spatial_lengths, gemm_g_m_k_strides);
compute_ptr_offset_of_batch_.BatchStrideA_ = image_g_n_c_wis_strides[I0];
compute_ptr_offset_of_batch_.BatchStrideC_ = gemm_g_m_k_strides[I0];
} }
void Print() const void Print() const
...@@ -192,6 +208,7 @@ struct DeviceImageToColumnImpl ...@@ -192,6 +208,7 @@ struct DeviceImageToColumnImpl
std::cout << out_grid_desc_m_k_ << std::endl; std::cout << out_grid_desc_m_k_ << std::endl;
} }
const ck::index_t G_;
const ck::index_t C_; const ck::index_t C_;
const ck::index_t X_; const ck::index_t X_;
...@@ -206,6 +223,8 @@ struct DeviceImageToColumnImpl ...@@ -206,6 +223,8 @@ struct DeviceImageToColumnImpl
InputGridDesc in_grid_desc_m_k_; InputGridDesc in_grid_desc_m_k_;
OutputGridDesc out_grid_desc_m_k_; OutputGridDesc out_grid_desc_m_k_;
ComputePtrOffsetOfStridedBatch<I0> compute_ptr_offset_of_batch_;
}; };
struct Invoker : public BaseInvoker struct Invoker : public BaseInvoker
...@@ -220,12 +239,14 @@ struct DeviceImageToColumnImpl ...@@ -220,12 +239,14 @@ struct DeviceImageToColumnImpl
const auto block_2_tile_map = const auto block_2_tile_map =
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>( BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
arg.out_grid_desc_m_k_); arg.out_grid_desc_m_k_);
const index_t grid_size = block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_); const index_t grid_size =
const auto kernel = kernel_tensor_rearrange<InputGridDesc, block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_) * arg.G_;
const auto kernel = kernel_tensor_rearrange<InputGridDesc,
InputDataType, InputDataType,
OutputGridDesc, OutputGridDesc,
OutputDataType, OutputDataType,
Block2ETileMap, Block2ETileMap,
ComputePtrOffsetOfStridedBatch<I0>,
GridwiseTensorRearrangeKernel>; GridwiseTensorRearrangeKernel>;
float elapsed_time = launch_and_time_kernel(stream_config, float elapsed_time = launch_and_time_kernel(stream_config,
...@@ -237,7 +258,9 @@ struct DeviceImageToColumnImpl ...@@ -237,7 +258,9 @@ struct DeviceImageToColumnImpl
arg.p_in_, arg.p_in_,
arg.out_grid_desc_m_k_, arg.out_grid_desc_m_k_,
arg.p_out_, arg.p_out_,
block_2_tile_map); arg.G_,
block_2_tile_map,
arg.compute_ptr_offset_of_batch_);
return elapsed_time; return elapsed_time;
} }
...@@ -250,9 +273,7 @@ struct DeviceImageToColumnImpl ...@@ -250,9 +273,7 @@ struct DeviceImageToColumnImpl
bool IsSupportedArgument(const Argument& arg) bool IsSupportedArgument(const Argument& arg)
{ {
using namespace tensor_layout::convolution; if constexpr(!(is_NSpatialGC || is_GNSpatialC))
if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
std::is_same_v<ImageLayout, GNDHWC>))
{ {
return false; return false;
} }
...@@ -295,13 +316,14 @@ struct DeviceImageToColumnImpl ...@@ -295,13 +316,14 @@ struct DeviceImageToColumnImpl
static auto MakeArgument(const void* p_in, // input image static auto MakeArgument(const void* p_in, // input image
void* p_out, // gemm form void* p_out, // gemm form
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
...@@ -309,13 +331,14 @@ struct DeviceImageToColumnImpl ...@@ -309,13 +331,14 @@ struct DeviceImageToColumnImpl
{ {
return Argument{static_cast<const InputDataType*>(p_in), return Argument{static_cast<const InputDataType*>(p_in),
static_cast<OutputDataType*>(p_out), static_cast<OutputDataType*>(p_out),
G,
N, N,
C, C,
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
image_g_n_c_wis_strides, image_g_n_c_wis_strides,
gemm_m_k_strides, gemm_g_m_k_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
...@@ -327,13 +350,14 @@ struct DeviceImageToColumnImpl ...@@ -327,13 +350,14 @@ struct DeviceImageToColumnImpl
std::unique_ptr<BaseArgument> std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_in, // input image MakeArgumentPointer(const void* p_in, // input image
void* p_out, // gemm form void* p_out, // gemm form
const ck::index_t G,
const ck::index_t N, const ck::index_t N,
const ck::index_t C, const ck::index_t C,
const std::array<index_t, NDimSpatial>& input_spatial_lengths, const std::array<index_t, NDimSpatial>& input_spatial_lengths,
const std::array<index_t, NDimSpatial>& filter_spatial_lengths, const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<index_t, NDimSpatial>& output_spatial_lengths, const std::array<index_t, NDimSpatial>& output_spatial_lengths,
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides, const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
const std::array<index_t, 2>& gemm_m_k_strides, const std::array<index_t, 3>& gemm_g_m_k_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides, const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations, const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads, const std::array<index_t, NDimSpatial>& input_left_pads,
...@@ -341,13 +365,14 @@ struct DeviceImageToColumnImpl ...@@ -341,13 +365,14 @@ struct DeviceImageToColumnImpl
{ {
return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in), return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
static_cast<OutputDataType*>(p_out), static_cast<OutputDataType*>(p_out),
G,
N, N,
C, C,
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
image_g_n_c_wis_strides, image_g_n_c_wis_strides,
gemm_m_k_strides, gemm_g_m_k_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
......
...@@ -21,6 +21,7 @@ template <typename InputGridDesc, ...@@ -21,6 +21,7 @@ template <typename InputGridDesc,
typename OutputGridDesc, typename OutputGridDesc,
typename OutputDataType, typename OutputDataType,
typename Block2ETileMap, typename Block2ETileMap,
typename ComputePtrOffsetOfStridedBatch,
typename GridwiseTensorRearrangeKernel> typename GridwiseTensorRearrangeKernel>
__global__ void __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
...@@ -30,13 +31,20 @@ __global__ void ...@@ -30,13 +31,20 @@ __global__ void
const InputDataType* __restrict__ p_in_global, const InputDataType* __restrict__ p_in_global,
const OutputGridDesc out_grid_desc, const OutputGridDesc out_grid_desc,
OutputDataType* __restrict__ p_out_global, OutputDataType* __restrict__ p_out_global,
const Block2ETileMap block_2_tile_map) const index_t batch_count,
const Block2ETileMap block_2_tile_map,
const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
{ {
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \ #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \ defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__)) defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
GridwiseTensorRearrangeKernel::Run( GridwiseTensorRearrangeKernel::Run(in_grid_desc,
in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map); p_in_global,
out_grid_desc,
p_out_global,
batch_count,
block_2_tile_map,
compute_ptr_offset_of_batch);
#else #else
ignore = in_grid_desc; ignore = in_grid_desc;
ignore = p_in_global; ignore = p_in_global;
...@@ -56,7 +64,8 @@ template <typename InputGridDesc, ...@@ -56,7 +64,8 @@ template <typename InputGridDesc,
typename ThreadClusterLengths, typename ThreadClusterLengths,
index_t ScalarPerVector, index_t ScalarPerVector,
InMemoryDataOperationEnum DstInMemOp, InMemoryDataOperationEnum DstInMemOp,
typename Block2ETileMap> typename Block2ETileMap,
typename ComputePtrOffsetOfStridedBatch>
struct GridwiseTensorRearrange struct GridwiseTensorRearrange
{ {
...@@ -69,7 +78,9 @@ struct GridwiseTensorRearrange ...@@ -69,7 +78,9 @@ struct GridwiseTensorRearrange
const InputDataType* __restrict__ p_in_global, const InputDataType* __restrict__ p_in_global,
const OutputGridDesc& out_grid_desc, const OutputGridDesc& out_grid_desc,
OutputDataType* __restrict__ p_out_global, OutputDataType* __restrict__ p_out_global,
const Block2ETileMap& block_2_tile_map) const index_t batch_count,
const Block2ETileMap& block_2_tile_map,
const ComputePtrOffsetOfStridedBatch& compute_ptr_offset_of_batch)
{ {
const auto block_work_idx = const auto block_work_idx =
block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
...@@ -80,12 +91,6 @@ struct GridwiseTensorRearrange ...@@ -80,12 +91,6 @@ struct GridwiseTensorRearrange
const index_t k_block_data_idx_on_grid = const index_t k_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * KPerBlock); __builtin_amdgcn_readfirstlane(block_work_idx[I1] * KPerBlock);
// Global Memory
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_in_global, in_grid_desc.GetElementSpaceSize());
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_out_global, out_grid_desc.GetElementSpaceSize());
auto copy_global_to_global = auto copy_global_to_global =
ThreadGroupTensorSliceTransfer_v7<ThisThreadBlock, ThreadGroupTensorSliceTransfer_v7<ThisThreadBlock,
Tuple<InputDataType>, Tuple<InputDataType>,
...@@ -108,6 +113,22 @@ struct GridwiseTensorRearrange ...@@ -108,6 +113,22 @@ struct GridwiseTensorRearrange
make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)), make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
tensor_operation::element_wise::PassThrough{}}; tensor_operation::element_wise::PassThrough{}};
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx =
__builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
// Global Memory
const index_t a_batch_offset =
__builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
const index_t c_batch_offset =
__builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize());
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_out_global + c_batch_offset, out_grid_desc.GetElementSpaceSize());
copy_global_to_global.Run( copy_global_to_global.Run(
tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf)); tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
} }
......
...@@ -19,9 +19,7 @@ namespace host { ...@@ -19,9 +19,7 @@ namespace host {
* \brief Reference implementation for column to image. * \brief Reference implementation for column to image.
* *
* Input tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout. * Input tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
* Memory layout is the same.
* Output tensor descriptor has [G, N, C, Di, Hi, Wi] data layout. * Output tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
* *
* \tparam NDimSpatial Number of spatial dimensions. * \tparam NDimSpatial Number of spatial dimensions.
* \tparam ImageLayout Image Layout. * \tparam ImageLayout Image Layout.
...@@ -95,18 +93,19 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -95,18 +93,19 @@ struct ReferenceColumnToImage : public device::BaseOperator
float Run(const Argument& arg) float Run(const Argument& arg)
{ {
if(!(arg.output_.GetNumOfDimension() == NDimSpatial + 3 && if(!(arg.output_.GetNumOfDimension() == NDimSpatial + 3 &&
arg.input_.GetNumOfDimension() == 2)) arg.input_.GetNumOfDimension() == 3))
{ {
throw std::runtime_error("wrong! inconsistent dimension"); throw std::runtime_error("wrong! inconsistent dimension");
} }
const index_t G = arg.output_.GetLengths()[0];
const index_t N = arg.output_.GetLengths()[1]; const index_t N = arg.output_.GetLengths()[1];
const index_t C = arg.output_.GetLengths()[2]; const index_t C = arg.output_.GetLengths()[2];
if constexpr(NDimSpatial == 1) if constexpr(NDimSpatial == 1)
{ {
const index_t Wo = arg.output_spatial_lengths_[0]; const index_t Wo = arg.output_spatial_lengths_[0];
auto func = [&](auto n) { auto func = [&](auto g, auto n) {
for(index_t wo = 0; wo < Wo; ++wo) for(index_t wo = 0; wo < Wo; ++wo)
{ {
index_t row = n * Wo + wo; index_t row = n * Wo + wo;
...@@ -123,9 +122,10 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -123,9 +122,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
if(wi >= 0 && if(wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.output_.GetLengths()[3]) ck::type_convert<std::size_t>(wi) < arg.output_.GetLengths()[3])
{ {
float v_in = ck::type_convert<float>(arg.input_(row, column)); float v_in =
float v_out = ck::type_convert<float>(arg.output_(0, n, c, wi)); ck::type_convert<float>(arg.input_(g, row, column));
arg.output_(0, n, c, wi) = float v_out = ck::type_convert<float>(arg.output_(g, n, c, wi));
arg.output_(g, n, c, wi) =
ck::type_convert<OutDataType>(v_in + v_out); ck::type_convert<OutDataType>(v_in + v_out);
} }
column++; column++;
...@@ -134,7 +134,7 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -134,7 +134,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
} }
}; };
make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
return 0; return 0;
} }
...@@ -143,7 +143,7 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -143,7 +143,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
const index_t Ho = arg.output_spatial_lengths_[0]; const index_t Ho = arg.output_spatial_lengths_[0];
const index_t Wo = arg.output_spatial_lengths_[1]; const index_t Wo = arg.output_spatial_lengths_[1];
auto func = [&](auto n) { auto func = [&](auto g, auto n) {
for(index_t ho = 0; ho < Ho; ++ho) for(index_t ho = 0; ho < Ho; ++ho)
{ {
for(index_t wo = 0; wo < Wo; ++wo) for(index_t wo = 0; wo < Wo; ++wo)
...@@ -176,10 +176,10 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -176,10 +176,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
arg.output_.GetLengths()[4]) arg.output_.GetLengths()[4])
{ {
float v_in = float v_in =
ck::type_convert<float>(arg.input_(row, column)); ck::type_convert<float>(arg.input_(g, row, column));
float v_out = ck::type_convert<float>( float v_out = ck::type_convert<float>(
arg.output_(0, n, c, hi, wi)); arg.output_(g, n, c, hi, wi));
arg.output_(0, n, c, hi, wi) = arg.output_(g, n, c, hi, wi) =
ck::type_convert<OutDataType>(v_in + v_out); ck::type_convert<OutDataType>(v_in + v_out);
} }
column++; column++;
...@@ -190,7 +190,7 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -190,7 +190,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
} }
}; };
make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
return 0; return 0;
} }
...@@ -200,7 +200,7 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -200,7 +200,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
const index_t Ho = arg.output_spatial_lengths_[1]; const index_t Ho = arg.output_spatial_lengths_[1];
const index_t Wo = arg.output_spatial_lengths_[2]; const index_t Wo = arg.output_spatial_lengths_[2];
auto func = [&](auto n) { auto func = [&](auto g, auto n) {
for(index_t d_o = 0; d_o < Do; ++d_o) for(index_t d_o = 0; d_o < Do; ++d_o)
{ {
for(index_t ho = 0; ho < Ho; ++ho) for(index_t ho = 0; ho < Ho; ++ho)
...@@ -245,10 +245,10 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -245,10 +245,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
arg.output_.GetLengths()[5]) arg.output_.GetLengths()[5])
{ {
float v_in = ck::type_convert<float>( float v_in = ck::type_convert<float>(
arg.input_(row, column)); arg.input_(g, row, column));
float v_out = ck::type_convert<float>( float v_out = ck::type_convert<float>(
arg.output_(0, n, c, di, hi, wi)); arg.output_(g, n, c, di, hi, wi));
arg.output_(0, n, c, di, hi, wi) = arg.output_(g, n, c, di, hi, wi) =
ck::type_convert<OutDataType>(v_in + v_out); ck::type_convert<OutDataType>(v_in + v_out);
} }
column++; column++;
...@@ -261,7 +261,7 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -261,7 +261,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
} }
}; };
make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
return 0; return 0;
} }
...@@ -303,8 +303,9 @@ struct ReferenceColumnToImage : public device::BaseOperator ...@@ -303,8 +303,9 @@ struct ReferenceColumnToImage : public device::BaseOperator
C * ck::accumulate_n<index_t>( C * ck::accumulate_n<index_t>(
arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
if(!(arg.input_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) && if(!(arg.input_.GetLengths()[0] == static_cast<std::size_t>(G) &&
arg.input_.GetLengths()[1] == static_cast<std::size_t>(CZYX))) arg.input_.GetLengths()[1] == static_cast<std::size_t>(NDoHoWo) &&
arg.input_.GetLengths()[2] == static_cast<std::size_t>(CZYX)))
{ {
return false; return false;
} }
......
...@@ -19,9 +19,7 @@ namespace host { ...@@ -19,9 +19,7 @@ namespace host {
* \brief Reference implementation for image to column. * \brief Reference implementation for image to column.
* *
* Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout. * Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C]. * Output tensor descriptor has [G * N * Do * Ho * Wo, Z * Y * X * C] data layout.
* Output tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
* Memory layout is the same.
* *
* \tparam NDimSpatial Number of spatial dimensions. * \tparam NDimSpatial Number of spatial dimensions.
* \tparam ImageLayout Image Layout. * \tparam ImageLayout Image Layout.
...@@ -95,18 +93,19 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -95,18 +93,19 @@ struct ReferenceImageToColumn : public device::BaseOperator
float Run(const Argument& arg) float Run(const Argument& arg)
{ {
if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 && if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 &&
arg.output_.GetNumOfDimension() == 2)) arg.output_.GetNumOfDimension() == 3))
{ {
throw std::runtime_error("wrong! inconsistent dimension"); throw std::runtime_error("wrong! inconsistent dimension");
} }
const index_t G = arg.input_.GetLengths()[0];
const index_t N = arg.input_.GetLengths()[1]; const index_t N = arg.input_.GetLengths()[1];
const index_t C = arg.input_.GetLengths()[2]; const index_t C = arg.input_.GetLengths()[2];
if constexpr(NDimSpatial == 1) if constexpr(NDimSpatial == 1)
{ {
const index_t Wo = arg.output_spatial_lengths_[0]; const index_t Wo = arg.output_spatial_lengths_[0];
auto func = [&](auto n, auto wo) { auto func = [&](auto g, auto n, auto wo) {
index_t row = n * Wo + wo; index_t row = n * Wo + wo;
index_t column = 0; index_t column = 0;
...@@ -121,15 +120,15 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -121,15 +120,15 @@ struct ReferenceImageToColumn : public device::BaseOperator
if(wi >= 0 && if(wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3]) ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
{ {
InDataType v_in = arg.input_(0, n, c, wi); InDataType v_in = arg.input_(g, n, c, wi);
arg.output_(row, column) = ck::type_convert<OutDataType>(v_in); arg.output_(g, row, column) = ck::type_convert<OutDataType>(v_in);
} }
column++; column++;
} }
} }
}; };
make_ParallelTensorFunctor(func, N, Wo)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(func, G, N, Wo)(std::thread::hardware_concurrency());
return 0; return 0;
} }
...@@ -138,7 +137,7 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -138,7 +137,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
const index_t Ho = arg.output_spatial_lengths_[0]; const index_t Ho = arg.output_spatial_lengths_[0];
const index_t Wo = arg.output_spatial_lengths_[1]; const index_t Wo = arg.output_spatial_lengths_[1];
auto func = [&](auto n, auto ho, auto wo) { auto func = [&](auto g, auto n, auto ho, auto wo) {
index_t row = n * Ho * Wo + ho * Wo + wo; index_t row = n * Ho * Wo + ho * Wo + wo;
index_t column = 0; index_t column = 0;
...@@ -162,8 +161,9 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -162,8 +161,9 @@ struct ReferenceImageToColumn : public device::BaseOperator
wi >= 0 && wi >= 0 &&
ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4]) ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
{ {
InDataType v_in = arg.input_(0, n, c, hi, wi); InDataType v_in = arg.input_(g, n, c, hi, wi);
arg.output_(row, column) = ck::type_convert<OutDataType>(v_in); arg.output_(g, row, column) =
ck::type_convert<OutDataType>(v_in);
} }
column++; column++;
} }
...@@ -171,7 +171,7 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -171,7 +171,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
} }
}; };
make_ParallelTensorFunctor(func, N, Ho, Wo)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(func, G, N, Ho, Wo)(std::thread::hardware_concurrency());
return 0; return 0;
} }
...@@ -181,7 +181,7 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -181,7 +181,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
const index_t Ho = arg.output_spatial_lengths_[1]; const index_t Ho = arg.output_spatial_lengths_[1];
const index_t Wo = arg.output_spatial_lengths_[2]; const index_t Wo = arg.output_spatial_lengths_[2];
auto func = [&](auto n, auto d_o, auto ho, auto wo) { auto func = [&](auto g, auto n, auto d_o, auto ho, auto wo) {
index_t row = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo; index_t row = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
index_t column = 0; index_t column = 0;
...@@ -213,8 +213,8 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -213,8 +213,8 @@ struct ReferenceImageToColumn : public device::BaseOperator
ck::type_convert<std::size_t>(wi) < ck::type_convert<std::size_t>(wi) <
arg.input_.GetLengths()[5]) arg.input_.GetLengths()[5])
{ {
InDataType v_in = arg.input_(0, n, c, di, hi, wi); InDataType v_in = arg.input_(g, n, c, di, hi, wi);
arg.output_(row, column) = arg.output_(g, row, column) =
ck::type_convert<OutDataType>(v_in); ck::type_convert<OutDataType>(v_in);
} }
column++; column++;
...@@ -224,7 +224,7 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -224,7 +224,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
} }
}; };
make_ParallelTensorFunctor(func, N, Do, Ho, Wo)( make_ParallelTensorFunctor(func, G, N, Do, Ho, Wo)(
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
return 0; return 0;
...@@ -267,8 +267,9 @@ struct ReferenceImageToColumn : public device::BaseOperator ...@@ -267,8 +267,9 @@ struct ReferenceImageToColumn : public device::BaseOperator
C * ck::accumulate_n<index_t>( C * ck::accumulate_n<index_t>(
arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) && if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(G) &&
arg.output_.GetLengths()[1] == static_cast<std::size_t>(CZYX))) arg.output_.GetLengths()[1] == static_cast<std::size_t>(NDoHoWo) &&
arg.output_.GetLengths()[2] == static_cast<std::size_t>(CZYX)))
{ {
return false; return false;
} }
......
add_instance_library(device_column_to_image_instance add_instance_library(device_column_to_image_instance
device_column_to_image_nhwc_1d_instance.cpp device_column_to_image_gnwc_1d_instance.cpp
device_column_to_image_nhwc_2d_instance.cpp device_column_to_image_gnhwc_2d_instance.cpp
device_column_to_image_nhwc_3d_instance.cpp device_column_to_image_gndhwc_3d_instance.cpp
device_column_to_image_nwgc_1d_instance.cpp
device_column_to_image_nhwgc_2d_instance.cpp
device_column_to_image_ndhwgc_3d_instance.cpp
) )
...@@ -11,7 +11,7 @@ namespace instance { ...@@ -11,7 +11,7 @@ namespace instance {
using namespace ck::conv_tensor_rearrange_op; using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_ndhwc_3d_bf16_instances( void add_device_column_to_image_gndhwc_3d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ColumnToImage>>>&
instances) instances)
{ {
...@@ -22,7 +22,7 @@ void add_device_column_to_image_ndhwc_3d_bf16_instances( ...@@ -22,7 +22,7 @@ void add_device_column_to_image_ndhwc_3d_bf16_instances(
#endif #endif
} }
void add_device_column_to_image_ndhwc_3d_f16_instances( void add_device_column_to_image_gndhwc_3d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ColumnToImage>>>&
instances) instances)
{ {
...@@ -33,7 +33,7 @@ void add_device_column_to_image_ndhwc_3d_f16_instances( ...@@ -33,7 +33,7 @@ void add_device_column_to_image_ndhwc_3d_f16_instances(
#endif #endif
} }
void add_device_column_to_image_ndhwc_3d_f32_instances( void add_device_column_to_image_gndhwc_3d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ColumnToImage>>>&
instances) instances)
{ {
...@@ -44,7 +44,7 @@ void add_device_column_to_image_ndhwc_3d_f32_instances( ...@@ -44,7 +44,7 @@ void add_device_column_to_image_ndhwc_3d_f32_instances(
#endif #endif
} }
void add_device_column_to_image_ndhwc_3d_i8_instances( void add_device_column_to_image_gndhwc_3d_i8_instances(
std::vector< std::vector<
std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ColumnToImage>>>& std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ColumnToImage>>>&
instances) instances)
......
...@@ -11,7 +11,7 @@ namespace instance { ...@@ -11,7 +11,7 @@ namespace instance {
using namespace ck::conv_tensor_rearrange_op; using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_nhwc_2d_bf16_instances( void add_device_column_to_image_gnhwc_2d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ColumnToImage>>>&
instances) instances)
{ {
...@@ -22,7 +22,7 @@ void add_device_column_to_image_nhwc_2d_bf16_instances( ...@@ -22,7 +22,7 @@ void add_device_column_to_image_nhwc_2d_bf16_instances(
#endif #endif
} }
void add_device_column_to_image_nhwc_2d_f16_instances( void add_device_column_to_image_gnhwc_2d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ColumnToImage>>>&
instances) instances)
{ {
...@@ -33,7 +33,7 @@ void add_device_column_to_image_nhwc_2d_f16_instances( ...@@ -33,7 +33,7 @@ void add_device_column_to_image_nhwc_2d_f16_instances(
#endif #endif
} }
void add_device_column_to_image_nhwc_2d_f32_instances( void add_device_column_to_image_gnhwc_2d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ColumnToImage>>>&
instances) instances)
{ {
...@@ -44,7 +44,7 @@ void add_device_column_to_image_nhwc_2d_f32_instances( ...@@ -44,7 +44,7 @@ void add_device_column_to_image_nhwc_2d_f32_instances(
#endif #endif
} }
void add_device_column_to_image_nhwc_2d_i8_instances( void add_device_column_to_image_gnhwc_2d_i8_instances(
std::vector< std::vector<
std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ColumnToImage>>>& std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ColumnToImage>>>&
instances) instances)
......
...@@ -11,7 +11,7 @@ namespace instance { ...@@ -11,7 +11,7 @@ namespace instance {
using namespace ck::conv_tensor_rearrange_op; using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_nwc_1d_bf16_instances( void add_device_column_to_image_gnwc_1d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ColumnToImage>>>&
instances) instances)
{ {
...@@ -22,7 +22,7 @@ void add_device_column_to_image_nwc_1d_bf16_instances( ...@@ -22,7 +22,7 @@ void add_device_column_to_image_nwc_1d_bf16_instances(
#endif #endif
} }
void add_device_column_to_image_nwc_1d_f16_instances( void add_device_column_to_image_gnwc_1d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ColumnToImage>>>&
instances) instances)
{ {
...@@ -33,7 +33,7 @@ void add_device_column_to_image_nwc_1d_f16_instances( ...@@ -33,7 +33,7 @@ void add_device_column_to_image_nwc_1d_f16_instances(
#endif #endif
} }
void add_device_column_to_image_nwc_1d_f32_instances( void add_device_column_to_image_gnwc_1d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ColumnToImage>>>&
instances) instances)
{ {
...@@ -44,7 +44,7 @@ void add_device_column_to_image_nwc_1d_f32_instances( ...@@ -44,7 +44,7 @@ void add_device_column_to_image_nwc_1d_f32_instances(
#endif #endif
} }
void add_device_column_to_image_nwc_1d_i8_instances( void add_device_column_to_image_gnwc_1d_i8_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ColumnToImage>>>& std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ColumnToImage>>>&
instances) instances)
{ {
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using namespace ck::conv_tensor_rearrange_op;
void add_device_column_to_image_ndhwgc_3d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_BF16
add_device_operation_instances(instances, device_column_to_image_bf16_instances<3, NDHWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_ndhwgc_3d_f16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_FP16
add_device_operation_instances(instances, device_column_to_image_f16_instances<3, NDHWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_ndhwgc_3d_f32_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_FP32
add_device_operation_instances(instances, device_column_to_image_f32_instances<3, NDHWGC>{});
#else
ignore = instances;
#endif
}
void add_device_column_to_image_ndhwgc_3d_i8_instances(
std::vector<
std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ColumnToImage>>>&
instances)
{
#ifdef CK_ENABLE_INT8
add_device_operation_instances(instances, device_column_to_image_i8_instances<3, NDHWGC>{});
#else
ignore = instances;
#endif
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment