Unverified Commit 984889fb authored by Illia Silin's avatar Illia Silin Committed by GitHub
Browse files

Merge pull request #13 from ROCmSoftwarePlatform/merge_from-_public_repo

Merge from  public repo
parents a73ab0d8 4a106f7d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using ImageLayout = ck::tensor_layout::convolution::NHWGC;
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 2;
static constexpr ck::index_t N = 32; // batch size
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 28; // input H
static constexpr ck::index_t Wi = 28; // input W
static constexpr ck::index_t Ho = 28; // output H
static constexpr ck::index_t Wo = 28; // output W
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main()
{
std::array<ck::index_t, 2> in_spatial_lengths{Hi, Wi};
std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};
// We have NHWGC in memory space
// However, CK's API only accepts lengths and strides with order of GNCHW.
// Hence, we need to adjust the order of strides.
std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 3> gemm_strides{Y * X * C, G * Y * X * C, 1};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C);
using namespace ck::conv_tensor_rearrange_op;
using DeviceOp = ck::tensor_operation::device::DeviceConvTensorRearrange<NumDimSpatial,
ImageLayout,
InDataType,
OutDataType,
ImageToColumn>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
G,
N,
C,
in_spatial_lengths,
out_spatial_lengths,
wei_spatial_lengths,
image_strides,
gemm_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(avg_time < best_avg_time)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_gb_per_sec
<< " GB/s, " << best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
G,
N,
C,
in_spatial_lengths,
out_spatial_lengths,
wei_spatial_lengths,
image_strides,
gemm_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
}
......@@ -2,7 +2,53 @@ cmake_minimum_required(VERSION 3.15)
project(ck_app)
add_compile_options(-std=c++17)
find_package(composable_kernel 1.0.0 COMPONENTS device_operations)
if (DTYPES)
add_definitions(-DDTYPES)
if (DTYPES MATCHES "int8")
add_definitions(-DCK_ENABLE_INT8)
if(NOT DEFINED ${CK_ENABLE_INT8})
set(CK_ENABLE_INT8 "ON")
endif()
endif()
if (DTYPES MATCHES "fp8")
add_definitions(-DCK_ENABLE_FP8)
if(NOT DEFINED ${CK_ENABLE_FP8})
set(CK_ENABLE_FP8 "ON")
endif()
endif()
if (DTYPES MATCHES "fp16")
add_definitions(-DCK_ENABLE_FP16)
if(NOT DEFINED ${CK_ENABLE_FP16})
set(CK_ENABLE_FP16 "ON")
endif()
endif()
if (DTYPES MATCHES "fp32")
add_definitions(-DCK_ENABLE_FP32)
if(NOT DEFINED ${CK_ENABLE_FP32})
set(CK_ENABLE_FP32 "ON")
endif()
endif()
if (DTYPES MATCHES "fp64")
add_definitions(-DCK_ENABLE_FP64)
if(NOT DEFINED ${CK_ENABLE_FP64})
set(CK_ENABLE_FP64 "ON")
endif()
endif()
if (DTYPES MATCHES "bf16")
add_definitions(-DCK_ENABLE_BF16)
if(NOT DEFINED ${CK_ENABLE_BF16})
set(CK_ENABLE_BF16 "ON")
endif()
endif()
message("DTYPES macro set to ${DTYPES}")
else()
add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
if(NOT DEFINED ${CK_ENABLE_ALL_DTYPES})
set(CK_ENABLE_ALL_DTYPES "ON")
endif()
endif()
find_package(composable_kernel COMPONENTS device_operations)
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")
......
......@@ -309,6 +309,8 @@ XML_OUTPUT
XML_PROGRAMLISTING
)
set(WARN_AS_ERROR YES)
set(DOXYGEN_CONFIG_FILE "${CMAKE_CURRENT_BINARY_DIR}/doxygen/doxygen.conf" CACHE PATH "Path to generated doxygen configuration file")
function(add_doxygen_doc)
......
......@@ -67,8 +67,10 @@ else()
-Wunused
-Wno-reserved-identifier
-Werror
-Wno-option-ignored
-Wsign-compare
-Wno-extra-semi-stmt
-Wno-unused-template
)
if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "Clang")
list(APPEND CMAKE_COMPILER_WARNINGS
......@@ -92,6 +94,7 @@ else()
-Wno-unused-command-line-argument
-Wno-weak-vtables
-Wno-covered-switch-default
-Wno-unsafe-buffer-usage
)
else()
if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "GNU" AND ${COMPILER} MATCHES "CXX")
......
rocm-docs-core==0.2.0
sphinxcontrib-bibtex==2.5.0
......@@ -7,8 +7,8 @@ API Reference Guide
Introduction
=================
This document contains details of the APIs for the Composable Kernel (CK) library and introduces some of the key design
principles that are used to write new classes that extend CK functionality.
This document contains details of the APIs for the Composable Kernel (CK) library and introduces
some of the key design principles that are used to write new classes that extend CK functionality.
=================
Using CK API
......@@ -30,8 +30,8 @@ DeviceMem
Kernels For Flashattention
---------------------------
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This sections lists the classes that are
used in the CK GPU implementation of Flashattention.
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This sections lists
the classes that are used in the CK GPU implementation of Flashattention.
**Gridwise classes**
......
......@@ -2,7 +2,101 @@
Contributor's Guide
===================
Pull-request guidelines
=======================
This chapter explains how to get started contributing to the Composable Kernel project and what are
the contributing rules.
[TODO]
Getting started
===============
#. **Documentation:** Before contributing to the library, familiarize yourself with the
`Composable Kernel User Guide <https://rocm.docs.amd.com/projects/composable_kernel/en/latest/>`_.
It provides insight into the core concepts, environment configuration, and steps to obtain or
build the library. You can also find some of this information in the
`README file <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/README.md>`_
on the project's GitHub page.
#. **Additional reading:** We also recommend reading a `blog post
<https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_
from the AMD Community portal. It offers a deeper understanding of the library's objectives and
showcases its performance capabilities.
#. **General information:** For broader information about AMD products, consider exploring the
`AMD Developer Central portal <https://www.amd.com/en/developer.html>`_.
How do I contribute
===================
We deeply value contributions from our users. You can make an impact by reporting issues or
proposing code enhancements through pull requests.
Reporting issues
----------------
We use `Github issues <https://github.com/ROCmSoftwarePlatform/composable_kernel/issues>`_
to track public bugs and enhancement requests.
If you encounter an issue with the library, please check if the problem has already been
reported by searching existing issues on GitHub. If your issue seems unique, please submit a new
issue. All reported issues must include:
* A comprehensive description of the problem, including:
* What did you observe?
* Why do you think it is a bug (if it seems like one)?
* What did you expect to happen? What would indicate the resolution of the problem?
* Are there any known workarounds?
* Your configuration details, including:
* Which GPU are you using?
* Which OS version are you on?
* Which ROCm version are you using?
* Are you using a Docker image? If so, which one?
* Steps to reproduce the issue, including:
* What actions trigger the issue? What are the reproduction steps?
* If you build the library from scratch, what CMake command did you use?
* How frequently does this issue happen? Does it reproduce every time? Or is it a sporadic issue?
Before sumbitting any issue, ensure you have addressed all relevant questions from the checklist.
Creating Pull Requests
----------------------
You can submit `Pull Requests (PR) on GitHub
<https://github.com/ROCmSoftwarePlatform/composable_kernel/pulls>`_.
All contributors are required to develop their changes on a separate branch and then create a
pull requrest to merge their changes into the `develop` branch, which is the default
development branch in the Composable Kernel project. All external contributors must use their own
forks of the project to develop their changes.
When submitting a Pull Request you should:
* Describe the change providing information about the motivation for the change and a general
description of all code modifications.
* Verify and test the change:
* Run any relevant existing tests.
* Write new tests if added functionality is not covered by current tests.
* Ensure your changes align with the coding style defined in the ``.clang-format`` file located in
the project's root directory. We leverage `pre-commit` to run `clang-format` automatically. We
highly recommend contributors utilize this method to maintain consistent code formatting.
Instructions on setting up `pre-commit` can be found in the project's
`README file <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/README.md>`_
* Link your PR to any related issues:
* If there is an issue that is resolved by your change, please provide a link to the issue in
the description of your pull request.
* For larger contributions, structure your change into a sequence of smaller, focused commits, each
addressing a particular aspect or fix.
Following the above guidelines ensures a seamless review process and faster assistance from our
end.
Thank you for your commitment to enhancing the Composable Kernel project! We look forward to collaborating with you.
......@@ -2,15 +2,16 @@
Supported Primitives Guide
==========================
This document contains details of supported primitives in Composable Kernel (CK). In contrast to the API Reference
Guide, the Supported Primitives Guide is an introduction to the math which underpins the algorithms implemented in CK.
This document contains details of supported primitives in Composable Kernel (CK). In contrast to the
API Reference Guide, the Supported Primitives Guide is an introduction to the math which underpins
the algorithms implemented in CK.
------------
Softmax
------------
For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can decompose the softmax of concatenated
:math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as,
For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can decompose the
softmax of concatenated :math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as,
.. math::
:nowrap:
......@@ -25,8 +26,8 @@ For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can d
where :math:`f(x^{(j)}) = \exp( x^{(j)} - m(x^{(j)}) )` is of size :math:`B` and
:math:`z(x^{(j)}) = f(x_1^{(j)})+ \ldots+ f(x_B^{(j)})` is a scalar.
For a matrix :math:`X` composed of :math:`T_r \times T_c` tiles, :math:`X_{ij}`, of size :math:`B_r \times B_c` we can
compute the row-wise softmax as follows.
For a matrix :math:`X` composed of :math:`T_r \times T_c` tiles, :math:`X_{ij}`, of size
:math:`B_r \times B_c` we can compute the row-wise softmax as follows.
For :math:`j` from :math:`1` to :math:`T_c`, and :math:`i` from :math:`1` to :math:`T_r` calculate,
......
......@@ -4,10 +4,21 @@
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import subprocess
from rocm_docs import ROCmDocs
docs_core = ROCmDocs("Composable Kernel Documentation")
docs_core.run_doxygen()
name = "Composable Kernel"
get_version = r'sed -n -e "s/^rocm_setup_version(.* \([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt'
version = subprocess.getoutput(get_version)
if len(version) > 0:
name = f"{name} {version}"
external_toc_path = "./sphinx/_toc.yml"
docs_core = ROCmDocs(f"{name} Documentation")
docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/docBin/xml")
docs_core.setup()
mathjax3_config = {
......
===================
CK docker hub
CK Docker Hub
===================
`Docker hub <https://hub.docker.com/r/rocm/composable_kernel>`_
-------------------------------------
Why do I need this?
-------------------------------------
To make our lives easier and bring Composable Kernel dependencies together, we recommend using docker images.
To make our lives easier and bring Composable Kernel dependencies together, we recommend using
docker images that can be found on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel>`_.
-------------------------------------
So what is Composable Kernel?
-------------------------------------
Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
Composable Kernel (CK) library aims to provide a programming model for writing performance critical
kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc,
through general purpose kernel languages, like HIP C++.
To get the CK library::
git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git
run a docker container::
docker run \
......@@ -30,7 +30,7 @@ run a docker container::
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/composable_kernel:ck_ub20.04_rocm5.3_release \
rocm/composable_kernel:ck_ub20.04_rocm5.6 \
/bin/bash
and build the CK::
......@@ -58,7 +58,9 @@ We can also run specific examples or tests like::
./bin/example_gemm_xdl_fp16
./bin/test_gemm_fp16
For more details visit `CK github repo <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_, `CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example)>`_, `even more CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example>`_.
For more details visit `CK github repository <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_,
`CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example)>`_,
`even more CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example>`_.
-------------------------------------
And what is inside?
......@@ -74,12 +76,11 @@ The docker images have everything you need for running CK including:
Which image is right for me?
-------------------------------------
Let's take a look at the image naming, for example "ck_ub20.04_rocm5.4_release". The image specs are:
Let's take a look at the image naming, for example ``ck_ub20.04_rocm5.6``. The image specs are:
* "ck" - made for running Composable Kernel
* "ub20.04" - based on Ubuntu 20.04
* "rocm5.4" - ROCm platform version 5.4
* "release" - compiler version is release
* ``ck`` - made for running Composable Kernel;
* ``ub20.04`` - based on Ubuntu 20.04;
* ``rocm5.6`` - ROCm platform version 5.6.
So just pick the right image for your project dependencies and you're all set.
......@@ -87,7 +88,9 @@ So just pick the right image for your project dependencies and you're all set.
DIY starts here
-------------------------------------
If you need to customize a docker image or just can't stop tinkering, feel free to adjust the `Dockerfile <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile>`_ for your needs.
If you need to customize a docker image or just can't stop tinkering, feel free to adjust the
`Dockerfile <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile>`_
for your needs.
-------------------------------------
License
......
......@@ -12,12 +12,15 @@ This document contains instructions for installing, using, and contributing to C
Methodology
-----------
Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
Composable Kernel (CK) library aims to provide a programming model for writing performance critical
kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc,
through general purpose kernel languages, like HIP C++.
CK utilizes two concepts to achieve performance portability and code maintainability:
* A tile-based programming model
* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation".
* Algorithm complexity reduction for complex ML operators, using innovative technique we call
"Tensor Coordinate Transformation".
.. image:: data/ck_component.png
:alt: CK Components
......
=======
License
=======
.. include:: ../LICENSE
:literal:
# Anywhere {branch} is used, the branch name will be substituted.
# These comments will also be removed.
defaults:
numbered: False
maxdepth: 6
root: index
subtrees:
- caption: About
entries:
- file: license
rocm-docs-core>=0.20.0
sphinxcontrib-bibtex==2.6.1
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile .sphinx/requirements.in
# pip-compile requirements.in
#
accessible-pygments==0.0.3
# via pydata-sphinx-theme
alabaster==0.7.13
# via sphinx
asttokens==2.2.1
# via stack-data
attrs==22.2.0
# via
# jsonschema
# jupyter-cache
babel==2.12.1
# via
# pydata-sphinx-theme
# sphinx
backcall==0.2.0
# via ipython
beautifulsoup4==4.11.2
# via pydata-sphinx-theme
breathe==4.34.0
......@@ -27,19 +19,15 @@ breathe==4.34.0
certifi==2022.12.7
# via requests
cffi==1.15.1
# via pynacl
# via
# cryptography
# pynacl
charset-normalizer==3.1.0
# via requests
click==8.1.3
# via
# jupyter-cache
# sphinx-external-toc
comm==0.1.2
# via ipykernel
debugpy==1.6.6
# via ipykernel
decorator==5.1.1
# via ipython
# via sphinx-external-toc
cryptography==40.0.2
# via pyjwt
deprecated==1.2.13
# via pygithub
docutils==0.16
......@@ -48,110 +36,40 @@ docutils==0.16
# myst-parser
# pybtex-docutils
# pydata-sphinx-theme
# rocm-docs-core
# sphinx
# sphinxcontrib-bibtex
executing==1.2.0
# via stack-data
fastjsonschema==2.16.3
# via nbformat
fastjsonschema==2.18.0
# via rocm-docs-core
gitdb==4.0.10
# via gitpython
gitpython==3.1.31
gitpython==3.1.35
# via rocm-docs-core
greenlet==2.0.2
# via sqlalchemy
idna==3.4
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.0.0
# via
# jupyter-cache
# myst-nb
ipykernel==6.21.3
# via myst-nb
ipython==8.11.0
# via
# ipykernel
# myst-nb
jedi==0.18.2
# via ipython
jinja2==3.1.2
# via
# myst-parser
# sphinx
jsonschema==4.17.3
# via nbformat
jupyter-cache==0.5.0
# via myst-nb
jupyter-client==8.0.3
# via
# ipykernel
# nbclient
jupyter-core==5.3.0
# via
# ipykernel
# jupyter-client
# nbformat
latexcodec==2.0.1
# via pybtex
linkify-it-py==1.0.3
# via myst-parser
markdown-it-py==2.2.0
# via
# mdit-py-plugins
# myst-parser
markupsafe==2.1.2
# via jinja2
matplotlib-inline==0.1.6
# via
# ipykernel
# ipython
mdit-py-plugins==0.3.5
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
myst-nb==0.17.1
myst-parser==1.0.0
# via rocm-docs-core
myst-parser[linkify]==0.18.1
# via
# myst-nb
# rocm-docs-core
nbclient==0.5.13
# via
# jupyter-cache
# myst-nb
nbformat==5.7.3
# via
# jupyter-cache
# myst-nb
# nbclient
nest-asyncio==1.5.6
# via
# ipykernel
# nbclient
packaging==23.0
# via
# ipykernel
# pydata-sphinx-theme
# sphinx
parso==0.8.3
# via jedi
pexpect==4.8.0
# via ipython
pickleshare==0.7.5
# via ipython
platformdirs==3.1.1
# via jupyter-core
prompt-toolkit==3.0.38
# via ipython
psutil==5.9.4
# via ipykernel
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
pybtex==0.24.0
# via
# pybtex-docutils
......@@ -160,57 +78,46 @@ pybtex-docutils==1.0.2
# via sphinxcontrib-bibtex
pycparser==2.21
# via cffi
pydata-sphinx-theme==0.13.1
# via sphinx-book-theme
pygithub==1.57
pydata-sphinx-theme==0.13.3
# via
# rocm-docs-core
# sphinx-book-theme
pygithub==1.58.2
# via rocm-docs-core
pygments==2.14.0
# via
# accessible-pygments
# ipython
# pydata-sphinx-theme
# sphinx
pyjwt==2.6.0
pyjwt[crypto]==2.6.0
# via pygithub
pynacl==1.5.0
# via pygithub
pyrsistent==0.19.3
# via jsonschema
python-dateutil==2.8.2
# via jupyter-client
pyyaml==6.0
# via
# jupyter-cache
# myst-nb
# myst-parser
# pybtex
# rocm-docs-core
# sphinx-external-toc
pyzmq==25.0.1
# via
# ipykernel
# jupyter-client
requests==2.28.2
# via
# pygithub
# sphinx
rocm-docs-core==0.2.0
# via -r .sphinx/requirements.in
rocm-docs-core==0.24.0
# via -r requirements.in
six==1.16.0
# via
# asttokens
# latexcodec
# pybtex
# python-dateutil
smmap==5.0.0
# via gitdb
snowballstemmer==2.2.0
# via sphinx
soupsieve==2.4
# via beautifulsoup4
sphinx==4.3.1
sphinx==5.3.0
# via
# breathe
# myst-nb
# myst-parser
# pydata-sphinx-theme
# rocm-docs-core
......@@ -220,7 +127,7 @@ sphinx==4.3.1
# sphinx-external-toc
# sphinx-notfound-page
# sphinxcontrib-bibtex
sphinx-book-theme==1.0.0rc2
sphinx-book-theme==1.0.1
# via rocm-docs-core
sphinx-copybutton==0.5.1
# via rocm-docs-core
......@@ -232,8 +139,8 @@ sphinx-notfound-page==0.8.3
# via rocm-docs-core
sphinxcontrib-applehelp==1.0.4
# via sphinx
sphinxcontrib-bibtex==2.5.0
# via -r .sphinx/requirements.in
sphinxcontrib-bibtex==2.6.1
# via -r requirements.in
sphinxcontrib-devhelp==1.0.2
# via sphinx
sphinxcontrib-htmlhelp==2.0.1
......@@ -244,40 +151,9 @@ sphinxcontrib-qthelp==1.0.3
# via sphinx
sphinxcontrib-serializinghtml==1.1.5
# via sphinx
sqlalchemy==1.4.46
# via jupyter-cache
stack-data==0.6.2
# via ipython
tabulate==0.9.0
# via jupyter-cache
tornado==6.2
# via
# ipykernel
# jupyter-client
traitlets==5.9.0
# via
# comm
# ipykernel
# ipython
# jupyter-client
# jupyter-core
# matplotlib-inline
# nbclient
# nbformat
typing-extensions==4.5.0
# via
# myst-nb
# myst-parser
uc-micro-py==1.0.1
# via linkify-it-py
# via pydata-sphinx-theme
urllib3==1.26.15
# via requests
wcwidth==0.2.6
# via prompt-toolkit
wrapt==1.15.0
# via deprecated
zipp==3.15.0
# via importlib-metadata
# The following packages are considered to be unsafe in a requirements file:
# setuptools
......@@ -6,15 +6,26 @@ CK Hello world
Motivation
-------------------------------------
This tutorial is aimed at engineers dealing with artificial intelligence and machine learning who would like to optimize their pipelines and squeeze every performance drop by adding Composable Kernel (CK) library to their projects. We would like to make the CK library approachable so the tutorial is not based on the latest release and doesn't have all the bleeding edge features, but it will be reproducible now and forever.
This tutorial is aimed at engineers dealing with artificial intelligence and machine learning who
would like to optimize their pipelines and squeeze every performance drop by adding Composable
Kernel (CK) library to their projects. We would like to make the CK library approachable so
the tutorial is not based on the latest release and doesn't have all the bleeding edge features,
but it will be reproducible now and forever.
During this tutorial we will have an introduction to the CK library, we will build it and run some examples and tests, so to say we will run a "Hello world" example. In future tutorials we will go in depth and breadth and get familiar with other tools and ways to integrate CK into your project.
During this tutorial we will have an introduction to the CK library, we will build it and run some
examples and tests, so to say we will run a "Hello world" example. In future tutorials we will go
in depth and breadth and get familiar with other tools and ways to integrate CK into your project.
-------------------------------------
Description
-------------------------------------
Modern AI technology solves more and more problems in all imaginable fields, but crafting fast and efficient workflows is still challenging. CK is one of the tools to make AI heavy lifting as fast and efficient as possible. CK is a collection of optimized AI operator kernels and tools to create new ones. The library has components required for majority of modern neural networks architectures including matrix multiplication, convolution, contraction, reduction, attention modules, variety of activation functions, fused operators and many more.
Modern AI technology solves more and more problems in all imaginable fields, but crafting fast and
efficient workflows is still challenging. CK is one of the tools to make AI heavy lifting as fast
and efficient as possible. CK is a collection of optimized AI operator kernels and tools to create
new ones. The library has components required for majority of modern neural networks architectures
including matrix multiplication, convolution, contraction, reduction, attention modules, variety of
activation functions, fused operators and many more.
So how do we (almost) reach the speed of light? CK acceleration abilities are based on:
......@@ -24,15 +35,18 @@ So how do we (almost) reach the speed of light? CK acceleration abilities are ba
* Hardware acceleration use.
* Support of low precision data types including fp16, bf16, int8 and int4.
If you are excited and need more technical details and benchmarking results - read this awesome `blog post <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_.
If you are excited and need more technical details and benchmarking results - read this awesome
`blog post <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_.
For more details visit our `github repo <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_.
For more details visit our `github repository <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_.
-------------------------------------
Hardware targets
-------------------------------------
CK library fully supports "gfx908" and "gfx90a" GPU architectures and only some operators are supported for "gfx1030". Let's check the hardware you have at hand and decide on the target GPU architecture
CK library fully supports `gfx908` and `gfx90a` GPU architectures and only some operators are
supported for `gfx1030`. Let's check the hardware you have at hand and decide on the target
GPU architecture.
========== =========
GPU Target AMD GPU
......@@ -42,7 +56,8 @@ gfx90a Radeon Instinct MI210, MI250, MI250X
gfx1030 Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6800 XT, RX 6900 XT, RX 6900 XTX, RX 6950 XT
========== =========
There are also `cloud options <https://aws.amazon.com/ec2/instance-types/g4/>`_ you can find if you don't have an AMD GPU at hand.
There are also `cloud options <https://aws.amazon.com/ec2/instance-types/g4/>`_ you can find if
you don't have an AMD GPU at hand.
-------------------------------------
Build the library
......@@ -54,9 +69,13 @@ First let's clone the library and rebase to the tested version::
cd composable_kernel/
git checkout tutorial_hello_world
To make our lives easier we prepared `docker images <https://hub.docker.com/r/rocm/composable_kernel>`_ with all the necessary dependencies. Pick the right image and create a container. In this tutorial we use "rocm/composable_kernel:ck_ub20.04_rocm5.3_release" image, it is based on Ubuntu 20.04, ROCm v5.3, compiler release version.
To make our lives easier we prepared
`docker images <https://hub.docker.com/r/rocm/composable_kernel>`_ with all the necessary
dependencies. Pick the right image and create a container. In this tutorial we use
``rocm/composable_kernel:ck_ub20.04_rocm5.6`` image, it is based on Ubuntu 20.04 and
ROCm v5.6.
If your current folder is ${HOME}, start the docker container with::
If your current folder is ``${HOME}``, start the docker container with::
docker run \
-it \
......@@ -64,20 +83,23 @@ If your current folder is ${HOME}, start the docker container with::
--group-add sudo \
-w /root/workspace \
-v ${HOME}:/root/workspace \
rocm/composable_kernel:ck_ub20.04_rocm5.3_release \
rocm/composable_kernel:ck_ub20.04_rocm5.6 \
/bin/bash
If your current folder is different from ${HOME}, adjust the line `-v ${HOME}:/root/workspace` to fit your folder structure.
If your current folder is different from ``${HOME}``, adjust the line ``-v ${HOME}:/root/workspace``
to fit your folder structure.
Inside the docker container current folder is "~/workspace", library path is "~/workspace/composable_kernel", navigate to the library::
Inside the docker container current folder is ``~/workspace``, library path is
``~/workspace/composable_kernel``, navigate to the library::
cd composable_kernel/
Create and go to the "build" directory::
Create and go to the ``build`` directory::
mkdir build && cd build
In the previous section we talked about target GPU architecture. Once you decide which one is right for you, run cmake using the right GPU_TARGETS flag::
In the previous section we talked about target GPU architecture. Once you decide which one is right
for you, run CMake using the right ``GPU_TARGETS`` flag::
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
......@@ -87,7 +109,7 @@ In the previous section we talked about target GPU architecture. Once you decide
-D BUILD_DEV=OFF \
-D GPU_TARGETS="gfx908;gfx90a;gfx1030" ..
If everything went well the cmake run will end up with::
If everything went well the CMake run will end up with::
-- Configuring done
-- Generating done
......@@ -118,9 +140,12 @@ We can also run them separately, here is a separate example execution::
./bin/example_gemm_xdl_fp16 1 1 1
The arguments "1 1 1" mean that we want to run this example in the mode: verify results with CPU, initialize matrices with integers and benchmark the kernel execution. You can play around with these parameters and see how output and execution results change.
The arguments ``1 1 1`` mean that we want to run this example in the mode: verify results with CPU,
initialize matrices with integers and benchmark the kernel execution. You can play around with
these parameters and see how output and execution results change.
If everything goes well and you have a device based on gfx908 or gfx90a architecture you should see something like::
If everything goes well and you have a device based on `gfx908` or `gfx90a` architecture you should see
something like::
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
......@@ -130,14 +155,15 @@ If everything goes well and you have a device based on gfx908 or gfx90a architec
Start running 10 times...
Perf: 1.10017 ms, 117.117 TFlops, 87.6854 GB/s, DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1
Meanwhile, running it on a gfx1030 device should result in::
Meanwhile, running it on a `gfx1030` device should result in::
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem
But don't panic, some of the operators are supported on gfx1030 architecture, so you can run a separate example like::
But don't panic, some of the operators are supported on `gfx1030` architecture, so you can run a
separate example like::
./bin/example_gemm_dl_fp16 1 1 1
......@@ -154,7 +180,14 @@ and it should result in something nice similar to::
Start running 10 times...
Perf: 3.65695 ms, 35.234 TFlops, 26.3797 GB/s, DeviceGemmDl<256, 128, 128, 16, 2, 4, 4, 1>
Or we can run a separate test::
.. note::
There was a new CMake flag ``DL_KERNELS`` added in the latest versions of CK. If you use one of
the newest versions of the library and do not see the above results when running
``example_gemm_dl_fp16``, it might be necessary to add ``-D DL_KERNELS=ON`` to your CMake command
in order to build the operators supported on the `gfx1030` architecture.
We can also run a separate test::
ctest -R test_gemm_fp16
......@@ -169,6 +202,9 @@ If everything goes well you should see something like::
Summary
-----------
In this tutorial we took the first look at the Composable Kernel library, built it on your system and ran some examples and tests. Stay tuned, in the next tutorial we will run kernels with different configs to find out the best one for your hardware and task.
In this tutorial we took the first look at the Composable Kernel library, built it on your system
and ran some examples and tests. Stay tuned, in the next tutorial we will run kernels with different
configs to find out the best one for your hardware and task.
P.S.: Don't forget to switch out the cloud instance if you have launched one, you can find better ways to spend your money for sure!
P.S.: Don't forget to switch off the cloud instance if you have launched one, you can find better
ways to spend your money for sure!
add_custom_target(example_gemm_dl)
add_example_executable(example_gemm_dl_fp32 gemm_dl_fp32.cpp)
add_example_dependencies(example_gemm_dl example_gemm_dl_fp32)
add_example_executable(example_gemm_dl_fp16 gemm_dl_fp16.cpp)
add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
add_example_dependencies(example_gemm_dl example_gemm_dl_fp16)
add_dependencies(example_gemm_dl example_gemm_dl_fp32)
add_dependencies(example_gemm_dl example_gemm_dl_fp16)
add_dependencies(example_gemm_dl example_gemm_dl_int8)
add_example_executable(example_gemm_dpp_fp16 gemm_dpp_fp16.cpp)
add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
add_example_dependencies(example_gemm_dl example_gemm_dl_int8)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_gemm_dl_int4 gemm_dl_int4.cpp)
add_dependencies(example_gemm_dl example_gemm_dl_int4)
add_example_executable(example_gemm_dl_int4 gemm_dl_int4.cpp)
add_example_dependencies(example_gemm_dl example_gemm_dl_int4)
endif(USE_BITINT_EXTENSION_INT4)
add_custom_target(example_gemm_xdl)
add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16)
add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)
add_example_executable(example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16)
if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
add_custom_target(example_gemm_wmma)
add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp)
add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
endif()
add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16)
add_dependencies(example_gemm_xdl example_gemm_xdl_fp16)
add_dependencies(example_gemm_xdl example_gemm_xdl_bf16)
add_dependencies(example_gemm_xdl example_gemm_xdl_int8)
add_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)
add_example_executable(example_gemm_xdl_bf16_rtn gemm_xdl_bf16_rtn.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_rtn)
add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_int8)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_gemm_xdl_int4 gemm_xdl_int4.cpp)
add_dependencies(example_gemm_xdl example_gemm_xdl_int4)
add_example_executable(example_gemm_xdl_int4 gemm_xdl_int4.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_int4)
endif(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp)
# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp64)
add_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16)
add_dependencies(example_gemm_xdl example_gemm_xdl_fp64)
add_example_executable(example_gemm_xdl_streamk gemm_xdl_streamk.cpp)
if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
add_custom_target(example_gemm_wmma)
add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp)
add_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
endif()
add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)
add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8)
add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -33,6 +33,19 @@ struct ProblemSize final
ck::index_t StrideC = 4096;
};
struct ProblemSizeStreamK final
{
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideC = 4096;
ck::index_t NumSKBlocks = -1;
};
struct ExecutionConfig final
{
bool do_verification = true;
......@@ -48,8 +61,17 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
inline bool
parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config)
template <typename ProblemType>
bool parse_cmd_args(int, char*[], ProblemType&, ExecutionConfig&)
{
return false;
}
template <>
bool parse_cmd_args<ProblemSize>(int argc,
char* argv[],
ProblemSize& problem_size,
ExecutionConfig& config)
{
if(argc == 1)
{
......@@ -87,3 +109,52 @@ parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfi
return true;
}
template <>
bool parse_cmd_args<ProblemSizeStreamK>(int argc,
char* argv[],
ProblemSizeStreamK& problem_size,
ExecutionConfig& config)
{
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
else if(argc >= 10)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
problem_size.M = std::stoi(argv[4]);
problem_size.N = std::stoi(argv[5]);
problem_size.K = std::stoi(argv[6]);
problem_size.StrideA = std::stoi(argv[7]);
problem_size.StrideB = std::stoi(argv[8]);
problem_size.StrideC = std::stoi(argv[9]);
if(argc >= 11)
{
problem_size.NumSKBlocks = std::stoi(argv[10]);
}
}
else
{
std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
<< std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
<< "arg10: NumSKBlocks(optional)" << std::endl;
return false;
}
return true;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment