Commit 11279540 authored by Astha Rai's avatar Astha Rai
Browse files

Merge branch 'transpose_5d' of github.com:ROCmSoftwarePlatform/composable_kernel into transpose_5d

parents 14daa201 33e78b9a
......@@ -14,10 +14,10 @@ None
### Additions
- Added an image to a column kernel (#867)
- Added a column to an image kernel (#930)
- Support for 3D grouped convolution forward on RDNA 3 GPUs (#935)
- Support for 3D grouped convolution on RDNA 3 GPUs (#935, #950, #985)
- Grouped convolution support for small K and C (#822 #879 #897)
- Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
- Support for bf16/f32/f16 and NHWGC (2D and 3d) grouped convolution backward data (#757 #799)
- Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
- Support for Batched Gemm DL (#732)
### Changes
......
......@@ -32,12 +32,10 @@ if (DTYPES)
if (DTYPES MATCHES "fp8")
add_definitions(-DCK_ENABLE_FP8)
set(CK_ENABLE_FP8 "ON")
add_compile_options(-Wno-bit-int-extension)
endif()
if (DTYPES MATCHES "bf8")
add_definitions(-DCK_ENABLE_BF8)
set(CK_ENABLE_BF8 "ON")
add_compile_options(-Wno-bit-int-extension)
endif()
if (DTYPES MATCHES "fp16")
add_definitions(-DCK_ENABLE_FP16)
......@@ -59,9 +57,11 @@ if (DTYPES)
else()
add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
set(CK_ENABLE_ALL_DTYPES "ON")
add_compile_options(-Wno-bit-int-extension) # enable fp8 and bf8
endif()
#for f8/bf8_t type
add_compile_options(-Wno-bit-int-extension)
if(DL_KERNELS)
add_definitions(-DDL_KERNELS)
set(CK_ENABLE_DL_KERNELS "ON")
......@@ -373,9 +373,10 @@ include_directories(BEFORE
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
add_compile_options(-Werror)
add_compile_options(-Weverything)
add_compile_options(-Werror -Weverything)
endif()
#add flags to reduce the size of binaries
add_compile_options(-Oz -flto=thin)
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
......@@ -390,35 +391,27 @@ IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu
file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
set(add_inst 0)
if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8")
#message("fp8 instance found!")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8")
#message("bf8 instance found!")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
#message("fp16 instance found!")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
#message("fp32 instance found!")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
#message("fp64 instance found!")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16")
#message("bf16 instance found!")
set(add_inst 1)
endif()
if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
#message("int8 instance found!")
set(add_inst 1)
endif()
if(NOT "${cmake_instance}" MATCHES "DTYPES")
#message("instance should be built for all types!")
set(add_inst 1)
endif()
if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
......
......@@ -26,29 +26,42 @@ RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
RUN amdgpu-install -y --usecase=rocm --no-dkms
## Sccache binary built from source for ROCm
ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
ENV SCCACHE_INSTALL_LOCATION=/usr/local/.cargo/bin
RUN mkdir -p ${SCCACHE_INSTALL_LOCATION} && \
curl ${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \
chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache
ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION}
# Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
build-essential \
ccache \
cmake \
ccache \
git \
hip-rocclr \
iputils-ping \
jq \
libelf-dev \
libncurses5-dev \
libnuma-dev \
libpthread-stubs0-dev \
llvm-amdgpu \
net-tools \
pkg-config \
python \
python3 \
python3-dev \
python3-pip \
redis \
sshpass \
stunnel \
software-properties-common \
vim \
nano \
zlib1g-dev \
zip \
openssh-server \
clang-format-12 \
kmod && \
......@@ -61,7 +74,7 @@ RUN gunzip /usr/local/bin/ninja.gz
RUN chmod a+x /usr/local/bin/ninja
RUN git clone https://github.com/nico/ninjatracing.git
# Update the cmake to the latest version
RUN pip install --upgrade cmake
RUN pip install --upgrade cmake==3.27.5
# Setup ubsan environment to printstacktrace
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
......@@ -76,9 +89,9 @@ ARG PREFIX=/opt/rocm
RUN pip3 install --upgrade pip
RUN pip3 install sqlalchemy==1.4.46
RUN pip3 install pymysql
RUN pip3 install pandas
RUN pip3 install pandas==2.0.3
RUN pip3 install setuptools-rust
RUN pip3 install sshtunnel
RUN pip3 install sshtunnel==0.4.0
# Setup ubsan environment to printstacktrace
ENV UBSAN_OPTIONS=print_stacktrace=1
......@@ -114,6 +127,8 @@ RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ];
else echo "using the release compiler"; \
fi
#clean-up the deb package
RUN sh -c "rm -rf amdgpu-install*"
#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
......@@ -65,10 +65,10 @@ def getDockerImageName(){
}
def check_host() {
if ("${env.CK_CCACHE}" != "null"){
def CCACHE_SERVER="${env.CK_CCACHE.split(':')[0]}"
echo "ccache server: ${CCACHE_SERVER}"
sh '''ping -c 1 -p 6379 "${CCACHE_SERVER}" | echo $? > tmp.txt'''
if ("${env.CK_SCCACHE}" != "null"){
def SCCACHE_SERVER="${env.CK_SCCACHE.split(':')[0]}"
echo "sccache server: ${SCCACHE_SERVER}"
sh '''ping -c 1 -p 6379 "${SCCACHE_SERVER}" | echo $? > tmp.txt'''
def output = readFile(file: "tmp.txt")
echo "tmp.txt contents: \$output"
return (output != "0")
......@@ -96,24 +96,9 @@ def build_compiler(){
def getDockerImage(Map conf=[:]){
env.DOCKER_BUILDKIT=1
def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm
def prefixpath = conf.get("prefixpath", "/opt/rocm")
def no_cache = conf.get("no_cache", false)
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
echo "ccache server: ${env.CK_CCACHE}"
if(env.CK_CCACHE)
{
if(check_host())
{
echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
}
else
{
echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
}
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
env.CCACHE_DIR = """/tmp/ccache_store"""
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
}
if(no_cache)
{
dockerArgs = dockerArgs + " --no-cache "
......@@ -142,21 +127,6 @@ def buildDocker(install_prefix){
def image_name = getDockerImageName()
echo "Building Docker for ${image_name}"
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
echo "ccache server: ${env.CK_CCACHE}"
if(env.CK_CCACHE)
{
if(check_host())
{
echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
}
else
{
echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
}
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
env.CCACHE_DIR = """/tmp/ccache_store"""
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
}
echo "Build Args: ${dockerArgs}"
try{
......@@ -169,7 +139,7 @@ def buildDocker(install_prefix){
else{
echo "Checking for image: ${image_name}"
sh "docker manifest inspect --insecure ${image_name}"
echo "Image: ${image_name} found!! Skipping building image"
echo "Image: ${image_name} found! Skipping building image"
}
}
catch(Exception ex){
......@@ -219,13 +189,9 @@ def cmake_build(Map conf=[:]){
}else{
setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args
}
if(env.CK_CCACHE)
{
setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER='ccache' -DCMAKE_C_COMPILER_LAUNCHER='ccache' " + setup_args
}
echo "ccache server: ${env.CK_CCACHE}"
def pre_setup_cmd = """
#!/bin/bash
echo \$HSA_ENABLE_SDMA
ulimit -c unlimited
rm -rf build
......@@ -234,6 +200,60 @@ def cmake_build(Map conf=[:]){
mkdir install
cd build
"""
def invocation_tag=""
if (setup_args.contains("gfx11")){
invocation_tag="gfx11"
}
if (setup_args.contains("gfx10")){
invocation_tag="gfx10"
}
if (setup_args.contains("gfx90")){
invocation_tag="gfx90"
}
if (setup_args.contains("gfx94")){
invocation_tag="gfx94"
}
echo "invocation tag: ${invocation_tag}"
def redis_pre_setup_cmd = pre_setup_cmd
if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
redis_pre_setup_cmd = pre_setup_cmd + """
#!/bin/bash
export ROCM_PATH=/opt/rocm
export SCCACHE_ENABLED=true
export SCCACHE_LOG_LEVEL=debug
export SCCACHE_IDLE_TIMEOUT=14400
export COMPILERS_HASH_DIR=/tmp/.sccache
export SCCACHE_BIN=/usr/local/.cargo/bin/sccache
export SCCACHE_EXTRAFILES=/tmp/.sccache/rocm_compilers_hash_file
export SCCACHE_REDIS="redis://${env.CK_SCCACHE}"
echo "connect = ${env.CK_SCCACHE}" >> ../script/redis-cli.conf
export SCCACHE_C_CUSTOM_CACHE_BUSTER="${invocation_tag}"
echo \$SCCACHE_C_CUSTOM_CACHE_BUSTER
stunnel ../script/redis-cli.conf
../script/sccache_wrapper.sh --enforce_redis
"""
try {
def cmd1 = conf.get("cmd1", """
${redis_pre_setup_cmd}
""")
sh cmd1
setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache " + setup_args
}
catch(Exception err){
echo "could not connect to redis server: ${err.getMessage()}. will not use sccache."
def cmd2 = conf.get("cmd2", """
${pre_setup_cmd}
""")
sh cmd2
}
}
else{
def cmd3 = conf.get("cmd3", """
${pre_setup_cmd}
""")
sh cmd3
}
def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
// reduce parallelism when compiling, clang uses too much memory
def nt = nthreads()
......@@ -241,17 +261,19 @@ def cmake_build(Map conf=[:]){
def execute_cmd = conf.get("execute_cmd", "")
def cmd = conf.get("cmd", """
${pre_setup_cmd}
${setup_cmd}
${build_cmd}
${execute_cmd}
""")
echo cmd
sh cmd
dir("build"){
sh cmd
}
// Only archive from master or develop
if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "master")) {
if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) {
archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
}
}
......@@ -526,6 +548,26 @@ def Build_CK(Map conf=[:]){
stash "ckprofiler_0.2.0_amd64.deb"
}
}
if (params.hipTensor_test && navi_node == 0 ){
//build and test hipTensor
sh """#!/bin/bash
rm -rf "${params.hipTensor_branch}".zip
rm -rf hipTensor-"${params.hipTensor_branch}"
wget https://github.com/ROCmSoftwarePlatform/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip
unzip -o "${params.hipTensor_branch}".zip
"""
dir("hipTensor-${params.hipTensor_branch}"){
sh """#!/bin/bash
mkdir -p build
ls -ltr
CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="/opt/rocm;${env.WORKSPACE}/install"
cmake --build build -- -j
"""
}
dir("hipTensor-${params.hipTensor_branch}/build"){
sh 'ctest'
}
}
}
}
}
......@@ -615,7 +657,7 @@ def process_results(Map conf=[:]){
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=
0 21 * * * % ROCMVERSION=5.7;COMPILER_VERSION=;COMPILER_COMMIT=
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=;USE_SCCACHE=false''' : ""
pipeline {
agent none
......@@ -654,6 +696,18 @@ pipeline {
name: "DL_KERNELS",
defaultValue: false,
description: "Select whether to build DL kernels (default: OFF)")
booleanParam(
name: "hipTensor_test",
defaultValue: true,
description: "Use the CK build to verify hipTensor build and tests (default: ON)")
string(
name: 'hipTensor_branch',
defaultValue: 'develop',
description: 'Specify which branch of hipTensor to use (default: develop)')
booleanParam(
name: "USE_SCCACHE",
defaultValue: true,
description: "Use the sccache for building CK (default: ON)")
}
environment{
dbuser = "${dbuser}"
......@@ -761,8 +815,8 @@ pipeline {
}
agent{ label rocmnode("navi32") }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
......
# Composable Kernel
## Methodology
The Composable Kernel (CK) library provides a programming model for writing performance-critical
kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library
uses general purpose kernel languages, such as HIP C++.
Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
CK uses two concepts to achieve performance portability and code maintainability:
CK utilizes two concepts to achieve performance portability and code maintainability:
* A tile-based programming model
* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation".
* Algorithm complexity reduction for complex machine learning (ML) operators. This uses an innovative
technique called *Tensor Coordinate Transformation*.
![ALT](/docs/data/ck_component.png "CK Components")
## Code Structure
The current CK library is structured into four layers:
Current CK library are structured into 4 layers:
* "Templated Tile Operators" layer
* "Templated Kernel and Invoker" layer
* "Instantiated Kernel and Invoker" layer
* "Client API" layer
* Templated Tile Operators
* Templated Kernel and Invoker
* Instantiated Kernel and Invoker
* Client API
![ALT](/docs/data/ck_layer.png "CK Layers")
## Documentation
## General information
Run the steps below to build documentation locally.
To build our documentation locally, use the following code:
```
``` bash
cd docs
pip3 install -r sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
## Contributors
The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page.
page.
## Citation
```note
If you use CK, cite us as follows:
If you use CK, please use following citations:
* CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???)
* [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???):
This paper will be available on arXiv soon.
* [CITATION.cff](/CITATION.cff)
```
## License
CK is released under the **[MIT license](/LICENSE)**.
CK is released under the MIT license. [License File](/LICENSE)
## Building CK
We recommend building CK inside Docker containers, which include all necessary packages. Pre-built
Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composable_kernel/tags).
# Build CK
1. To build a new Docker image, use the Dockerfile provided with the source code:
## Build docker image
```bash
DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
```
```bash
DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
```
Pre-built dockers are available from this public repo:
https://hub.docker.com/r/rocm/composable_kernel/tags
2. Launch the Docker container:
## Launch docker
```bash
docker run \
-it \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
ck:latest \
/bin/bash
```
```bash
docker run \
-it \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
ck:latest \
/bin/bash
```
3. Clone CK source code from the GitHub repository and start the build:
## Build CK
```bash
git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git && \
cd composable_kernel && \
mkdir build && \
cd build
```
```bash
mkdir build && cd build
You must set the `GPU_TARGETS` macro to specify the GPU target architecture(s) you want
to run CK on. You can specify single or multiple architectures. If you specify multiple architectures,
use a semicolon between each; for example, `gfx908;gfx90a;gfx940`.
# Need to specify target ID, example below is for gfx908 and gfx90a
```bash
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
..
```
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
..
```
If you don't set `GPU_TARGETS` on the cmake command line, CK is built for all GPU targets
supported by the current compiler (this may take a long time).
If GPU_TARGETS is not set on the cmake command line, CK will be built for all targets supported by the
current compiler.
4. Build the entire CK library:
```bash
make -j
```
Additional cmake flags can be used to significantly speed-up the build:
5. Install CK:
INSTANCES_ONLY (by default is OFF) must be set to ON in order to build only the instances and library
while skipping all tests, examples, and profiler. This is useful for libraries that use CK as a dependency.
```bash
make -j install
```
DTYPES (by default not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances
of select data types only. Currently, building of int8 instances is taking a lot of time (the compiler fix is in the works).
## Optional post-install steps
DL_KERNELS (by default is OFF) must be set to ON in order to build the gemm_dl and batched_gemm_multi_d_dl
instances. Those instances are only needed for the NAVI2x platforms.
* Build examples and tests:
### Build examples and tests
```bash
make -j examples tests
```
```bash
make -j examples tests
make test
```
* Build and run all examples and tests:
```bash
make -j check
```
Instructions for running each individual examples are under [example](/example)
You can find instructions for running each individual example in [example](/example).
* Build ckProfiler:
## Build ckProfiler
```bash
make -j ckProfiler
```
You can find instructions for running ckProfiler in [profiler](/profiler).
Note the `-j` option for building with multiple threads in parallel. This speeds up the build significantly.
Depending on the number of CPU cores and the amount of RAM on your system, you may want to
limit the number of threads. For example, if you have a 128-core CPU and 64 Gb of RAM.
By default, `-j` launches one thread per CPU core, which can cause the build to run out of memory and
crash. In such cases, you can reduce the number of threads to 32 by using `-j32`.
Additional cmake flags can be used to significantly speed-up the build:
* `INSTANCES_ONLY` (default is OFF) must be set to ON in order to build only the instances and library
while skipping all tests, examples, and profiler. This is useful in cases when you plan to use CK as a
dependency and don't plan to run any examples or tests.
* `DTYPES` (default is not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build
instances of select data types only. The main default data types are fp32 and fp16; you can safely skip
other data types.
* `DL_KERNELS` (default is OFF) must be set to ON in order to build instances, such as `gemm_dl` or
`batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
other platforms have faster instances, such as `xdl` or `wmma`, available.
## Using sccache for building
The default CK Docker images come with a pre-installed version of sccache, which supports clang
being used as hip-compiler (" -x hip"). Using sccache can help reduce the time to re-build code from
hours to 1-2 minutes. In order to invoke sccache, you need to run:
```bash
make -j ckProfiler
sccache --start-server
```
Instructions for running ckProfiler are under [profiler](/profiler)
## Install CK
then add the following flags to the cmake command line:
```bash
make install
-DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache
```
You may need to clean up the build folder and repeat the cmake and make steps in order to take
advantage of the sccache during subsequent builds.
## Using CK as pre-built kernel library
Instructions for using CK as a pre-built kernel library are under [client_example](/client_example)
You can find instructions for using CK as a pre-built kernel library in [client_example](/client_example).
## Contributing
## Contributing to CK
When you contribute to Composable Kernel, make sure to run `clang-format` on all the changed files. We highly recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
When you contribute to CK, make sure you run `clang-format` on all changed files. We highly
recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
```bash
sudo script/install_precommit.sh
```
This way, `pre-commit` will add the appropriate hooks to your local repository and automatically run `clang-format` (and possibly additional checks) before any commit is created.
With this approach, `pre-commit` adds the appropriate hooks to your local repository and
automatically runs `clang-format` (and possibly additional checks) before any commit is created.
If you need to uninstall hooks from the repository, you can do so by running the following command:
......@@ -141,14 +191,5 @@ If you need to uninstall hooks from the repository, you can do so by running the
script/uninstall_precommit.sh
```
If for any reason, you need to temporarily disable precommit hooks, you can add the `--no-verify` option to the `git commit` command.
## Caveat
### Kernel Timing and Verification
CK's own kernel timer will warn up kernel once, and then run it multiple times
to get average kernel time. For some kernels that use atomic add, this will cause
output buffer to be accumulated multiple times, causing verification failure.
To work around it, do not use CK's own timer and do verification at the same time.
CK's own timer and verification in each example and ckProfiler can be enabled or
disabled from command line.
If you need to temporarily disable pre-commit hooks, you can add the `--no-verify` option to the
`git commit` command.
......@@ -12,12 +12,14 @@
#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using ComputeDataType = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using SaveMeanInvStdDataType = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
#define SAVE_MEAN_INV_STD
constexpr int Rank = 2;
constexpr int NumReduceDim = 1;
......@@ -50,12 +52,16 @@ int main(int argc, char* argv[])
SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
#ifdef SAVE_MEAN_INV_STD
SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * M);
SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * M);
#endif
using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
SaveMeanInvStdDataType,
PassThrough,
Rank,
NumReduceDim>;
......@@ -84,14 +90,21 @@ int main(int argc, char* argv[])
{0, 1}, // gammaStrides
{0, 1}, // betaStrides
{Stride, 1}, // yStrides
{1}, // save_mean Strides
{1}, // save_inv_std Strides
{1}, // reduceDims
1e-4,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
#ifdef SAVE_MEAN_INV_STD
save_mean_device_buf.GetDeviceBuffer(),
save_inv_std_device_buf.GetDeviceBuffer(),
#else
nullptr,
nullptr,
#endif
PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
......@@ -109,6 +122,10 @@ int main(int argc, char* argv[])
std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N +
sizeof(BetaDataType) * N + sizeof(YDataType) * M * N;
#ifdef SAVE_MEAN_INV_STD
num_byte += sizeof(SaveMeanInvStdDataType) * M * 2;
#endif
float gb_per_sec = num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
......@@ -140,17 +157,24 @@ int main(int argc, char* argv[])
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
{Stride, 1}, // xStrides
{1}, // gammaStrides
{1}, // betaStrides
{0, 1}, // gammaStrides
{0, 1}, // betaStrides
{Stride, 1}, // yStrides
{1}, // save_mean Strides
{1}, // save_inv_std Strides
{1}, // reduceDims
1e-4,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
#ifdef SAVE_MEAN_INV_STD
save_mean_device_buf.GetDeviceBuffer(),
save_inv_std_device_buf.GetDeviceBuffer(),
#else
nullptr,
nullptr,
#endif
PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
......
......@@ -12,12 +12,14 @@
#include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp"
using XDataType = ck::half_t;
using GammaDataType = float;
using BetaDataType = float;
using YDataType = ck::half_t;
using ComputeDataType = float;
using Swish = ck::tensor_operation::element_wise::Swish;
using XDataType = ck::half_t;
using GammaDataType = float;
using BetaDataType = float;
using YDataType = ck::half_t;
using SaveMeanInvStdDataType = float;
using Swish = ck::tensor_operation::element_wise::Swish;
#define SAVE_MEAN_INV_STD
constexpr int Rank = 5;
constexpr int NumReduceDim = 3;
......@@ -49,19 +51,24 @@ int main(int argc, char* argv[])
std::size_t xy_size = N * H * W * G * C;
std::size_t gamma_beta_size = G * C;
std::vector<ck::index_t> xy_strides = {H * W * G * C, W * G * C, G * C, C, 1};
std::vector<ck::index_t> gamma_beta_strides = {0, 0, 0, C, 1};
std::vector<ck::index_t> xy_strides = {H * W * G * C, W * G * C, G * C, C, 1};
std::vector<ck::index_t> gamma_beta_strides = {0, 0, 0, C, 1};
std::vector<ck::index_t> save_mean_inv_std_strides = {G, 1};
SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size);
SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_beta_size);
SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * gamma_beta_size);
SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
#ifdef SAVE_MEAN_INV_STD
SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * N * G);
SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * N * G);
#endif
using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
SaveMeanInvStdDataType,
Swish,
Rank,
NumReduceDim>;
......@@ -75,19 +82,26 @@ int main(int argc, char* argv[])
const auto& generic_op_ptr = op_ptrs[0];
auto generic_argument_ptr =
generic_op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
{1, 2, 4}, // reduceDims
generic_op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
save_mean_inv_std_strides, // save_mean Strides
save_mean_inv_std_strides, // save_inv_std Strides
{1, 2, 4}, // reduceDims
1e-6,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
#ifdef SAVE_MEAN_INV_STD
save_mean_device_buf.GetDeviceBuffer(),
save_inv_std_device_buf.GetDeviceBuffer(),
#else
nullptr,
nullptr,
#endif
Swish{});
if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
......@@ -107,21 +121,29 @@ int main(int argc, char* argv[])
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
{1, 2, 4}, // reduceDims
1e-6,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
nullptr,
nullptr,
Swish{});
auto& op_ptr = op_ptrs[i];
auto argument_ptr =
op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
save_mean_inv_std_strides, // save_mean Strides
save_mean_inv_std_strides, // save_inv_std Strides
{1, 2, 4}, // reduceDims
1e-6,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
#ifdef SAVE_MEAN_INV_STD
save_mean_device_buf.GetDeviceBuffer(),
save_inv_std_device_buf.GetDeviceBuffer(),
#else
nullptr,
nullptr,
#endif
Swish{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
......@@ -139,6 +161,10 @@ int main(int argc, char* argv[])
sizeof(XDataType) * xy_size + sizeof(GammaDataType) * gamma_beta_size +
sizeof(BetaDataType) * gamma_beta_size + sizeof(YDataType) * xy_size;
#ifdef SAVE_MEAN_INV_STD
num_byte += sizeof(SaveMeanInvStdDataType) * N * G * 2;
#endif
float gb_per_sec = num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
......@@ -169,20 +195,28 @@ int main(int argc, char* argv[])
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
{1, 2, 4}, // reduceDims
1e-6,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
nullptr,
nullptr,
Swish{});
auto argument_ptr =
op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
save_mean_inv_std_strides, // save_mean Strides
save_mean_inv_std_strides, // save_inv_std Strides
{1, 2, 4}, // reduceDims
1e-6,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
#ifdef SAVE_MEAN_INV_STD
save_mean_device_buf.GetDeviceBuffer(),
save_inv_std_device_buf.GetDeviceBuffer(),
#else
nullptr,
nullptr,
#endif
Swish{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
......
......@@ -16,10 +16,10 @@
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using ImageLayout = ck::tensor_layout::convolution::GNHWC;
using ImageLayout = ck::tensor_layout::convolution::NHWGC;
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1;
static constexpr ck::index_t G = 2;
static constexpr ck::index_t N = 32; // batch size
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H
......@@ -52,18 +52,18 @@ int main()
std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};
// We have NHWGC in memory space (G is dummy)
// However, CK's API only accept length and stride with order of GNCHW
// Hence, we need to adjust the order of stride
// We have NHWGC in memory space
// However, CK's API only accepts lengths and strides with order of GNCHW.
// Hence, we need to adjust the order of strides.
std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 2> gemm_strides{Y * X * C, 1};
std::array<ck::index_t, 3> gemm_strides{Y * X * C, G * Y * X * C, 1};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Ho * Wo * Y * X * C);
SimpleDeviceMem in(sizeof(InDataType) * G * N * Ho * Wo * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Hi * Wi * G * C);
using namespace ck::conv_tensor_rearrange_op;
......@@ -93,6 +93,7 @@ int main()
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
G,
N,
C,
in_spatial_lengths,
......@@ -112,7 +113,7 @@ int main()
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
sizeof(OutDataType) * N * Ho * Wo * Y * X * C;
sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
......@@ -149,6 +150,7 @@ int main()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
G,
N,
C,
in_spatial_lengths,
......
......@@ -16,10 +16,10 @@
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using ImageLayout = ck::tensor_layout::convolution::GNHWC;
using ImageLayout = ck::tensor_layout::convolution::NHWGC;
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1;
static constexpr ck::index_t G = 2;
static constexpr ck::index_t N = 32; // batch size
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H
......@@ -52,11 +52,11 @@ int main()
std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};
// We have NHWGC in memory space (G is dummy)
// However, CK's API only accept length and stride with order of GNCHW
// Hence, we need to adjust the order of stride
// We have NHWGC in memory space
// However, CK's API only accepts lengths and strides with order of GNCHW.
// Hence, we need to adjust the order of strides.
std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 2> gemm_strides{Y * X * C, 1};
std::array<ck::index_t, 3> gemm_strides{Y * X * C, G * Y * X * C, 1};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
......@@ -64,7 +64,7 @@ int main()
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C);
using namespace ck::conv_tensor_rearrange_op;
......@@ -93,6 +93,7 @@ int main()
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
G,
N,
C,
in_spatial_lengths,
......@@ -112,7 +113,7 @@ int main()
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
sizeof(OutDataType) * N * Ho * Wo * Y * X * C;
sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
......@@ -149,6 +150,7 @@ int main()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
G,
N,
C,
in_spatial_lengths,
......
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 PRIVATE composable_kernel::device_operations)
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp16 grouped_conv_fwd_scaleadd_scaleadd_relu_fp16.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp16 PRIVATE composable_kernel::device_operations)
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_bf16 grouped_conv_fwd_scaleadd_scaleadd_relu_bf16.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_bf16 PRIVATE composable_kernel::device_operations)
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_int8 grouped_conv_fwd_scaleadd_scaleadd_relu_int8.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_int8 PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G = 32;
static constexpr ck::index_t N = 64; // batch size
static constexpr ck::index_t K = 64; // output channel
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Z = 3; // filter D
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Di = 14; // input D
static constexpr ck::index_t Hi = 14; // input H
static constexpr ck::index_t Wi = 14; // input W
static constexpr ck::index_t Do = 14; // output D
static constexpr ck::index_t Ho = 14; // output H
static constexpr ck::index_t Wo = 14; // output W
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int execute_conv_fwd_scaleadd_scaleadd_relu()
{
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
// However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
// Hence, we need to adjust the order of strides.
std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
std::array<ck::index_t, 6> in_strides{
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
std::array<ck::index_t, 6> wei_strides{
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
std::array<ck::index_t, 6> out_strides{
C, Do * Ho * Wo * G * C, 1, Ho * Wo * G * C, Wo * G * C, G * C};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Di * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
SimpleDeviceMem d0(sizeof(std::tuple_element_t<0, DDataTypes>) * N * Do * Ho * Wo * G * K);
SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * N * Do * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
NumDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<OutLayout, OutLayout>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<std::tuple_element_t<0, DDataTypes>, std::tuple_element_t<1, DDataTypes>>,
OutDataType,
PassThrough,
PassThrough,
ScaleAddScaleAddRelu>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr =
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{d0.GetDeviceBuffer(), d1.GetDeviceBuffer()},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{out_lengths, out_lengths},
{out_strides, out_strides},
out_lengths,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
ScaleAddScaleAddRelu{2.f, 2.f});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop =
std::size_t(2) * G * N * K * C * Ho * Wo * Y * X + 2 * N * Ho * Wo * G * K;
std::size_t num_bytes =
sizeof(InDataType) * N * Hi * Wi * G * C + sizeof(WeiDataType) * G * K * Y * X * C +
(sizeof(OutDataType) + sizeof(std::tuple_element_t<0, DDataTypes>) +
sizeof(std::tuple_element_t<1, DDataTypes>)) *
N * Ho * Wo * G * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_tflops = tflops;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr =
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{d0.GetDeviceBuffer(), d1.GetDeviceBuffer()},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{out_lengths, out_lengths},
{out_strides, out_strides},
out_lengths,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
ScaleAddScaleAddRelu{2.f, 2.f});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using InDataType = ck::bhalf_t;
using WeiDataType = ck::bhalf_t;
using OutDataType = ck::bhalf_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<ck::bhalf_t, ck::bhalf_t>;
#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<ck::half_t, ck::half_t>;
#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using InDataType = float;
using WeiDataType = float;
using OutDataType = float;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<float, float>;
#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using InDataType = int8_t;
using WeiDataType = int8_t;
using OutDataType = int8_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<float, float>;
#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
......@@ -70,6 +70,7 @@ else()
-Wno-option-ignored
-Wsign-compare
-Wno-extra-semi-stmt
-Wno-unused-template
)
if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "Clang")
list(APPEND CMAKE_COMPILER_WARNINGS
......
rocm-docs-core>=0.20.0
sphinxcontrib-bibtex==2.5.0
sphinxcontrib-bibtex==2.6.1
......@@ -103,7 +103,7 @@ requests==2.28.2
# via
# pygithub
# sphinx
rocm-docs-core==0.24.0
rocm-docs-core==0.26.0
# via -r requirements.in
six==1.16.0
# via
......@@ -139,7 +139,7 @@ sphinx-notfound-page==0.8.3
# via rocm-docs-core
sphinxcontrib-applehelp==1.0.4
# via sphinx
sphinxcontrib-bibtex==2.5.0
sphinxcontrib-bibtex==2.6.1
# via -r requirements.in
sphinxcontrib-devhelp==1.0.2
# via sphinx
......
add_custom_target(example_gemm_dl)
add_example_executable(example_gemm_dl_fp32 gemm_dl_fp32.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_dl example_gemm_dl_fp32)
endif()
add_example_dependencies(example_gemm_dl example_gemm_dl_fp32)
add_example_executable(example_gemm_dl_fp16 gemm_dl_fp16.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_dl example_gemm_dl_fp16)
endif()
add_example_dependencies(example_gemm_dl example_gemm_dl_fp16)
add_example_executable(example_gemm_dpp_fp16 gemm_dpp_fp16.cpp)
add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_dl example_gemm_dl_int8)
endif()
add_example_dependencies(example_gemm_dl example_gemm_dl_int8)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_gemm_dl_int4 gemm_dl_int4.cpp)
add_dependencies(example_gemm_dl example_gemm_dl_int4)
add_example_dependencies(example_gemm_dl example_gemm_dl_int4)
endif(USE_BITINT_EXTENSION_INT4)
add_custom_target(example_gemm_xdl)
add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_xdl example_gemm_xdl_fp16)
endif()
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16)
add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)
endif()
add_example_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)
add_example_executable(example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16)
endif()
add_example_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16)
if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
add_custom_target(example_gemm_wmma)
add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
endif()
add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
endif()
add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_xdl example_gemm_xdl_bf16)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16)
add_example_executable(example_gemm_xdl_bf16_rtn gemm_xdl_bf16_rtn.cpp)
add_dependencies(example_gemm_xdl example_gemm_xdl_bf16_rtn)
endif()
add_example_executable(example_gemm_xdl_bf16_rtn gemm_xdl_bf16_rtn.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_rtn)
add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_xdl example_gemm_xdl_int8)
endif()
add_example_dependencies(example_gemm_xdl example_gemm_xdl_int8)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_gemm_xdl_int4 gemm_xdl_int4.cpp)
add_dependencies(example_gemm_xdl example_gemm_xdl_int4)
add_example_executable(example_gemm_xdl_int4 gemm_xdl_int4.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_int4)
endif(USE_BITINT_EXTENSION_INT4)
# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_xdl example_gemm_xdl_fp64)
endif()
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp64)
add_example_executable(example_gemm_xdl_streamk gemm_xdl_streamk.cpp)
add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_xdl example_gemm_xdl_fp8)
endif()
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)
add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8)
endif()
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8)
add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8)
endif()
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8)
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_custom_target(example_gemm_add_add_fastgelu_xdl)
add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_bf16)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_custom_target(example_gemm_add_add_fastgelu_xdl)
add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_bf16)
add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp16)
add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp32)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_gemm_add_add_fastgelu_xdl_int4 gemm_add_add_fastgelu_xdl_int4.cpp)
add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
endif(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8)
set(target 1)
endif()
add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp16)
endif()
add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp32)
endif()
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_gemm_add_add_fastgelu_xdl_int4 gemm_add_add_fastgelu_xdl_int4.cpp)
add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
endif(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
if(result EQUAL 0)
add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8)
endif()
set(target 1)
endif()
endforeach()
\ No newline at end of file
endforeach()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment