Merge branch 'develop' into ck_tile/gemm_debug_alias

8d2f2f8c · coderfeli · 99c8123f · 4cb3d7d7 · 8d2f2f8c · 8d2f2f8c
Commit 8d2f2f8c authored Dec 05, 2024 by coderfeli
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,13 +185,22 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
    add_definitions(-DCK_USE_XDL)
 endif()
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx94")
-    message("Enabling FP8 gemms in ckProfiler")
+    message("Enabling FP8 gemms on native architectures")
    add_definitions(-DCK_USE_GFX94)
 endif()
 if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
    message("Enabling WMMA instances")
    add_definitions(-DCK_USE_WMMA)
 endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx12")
+    add_definitions(-DCK_USE_OCP_FP8)
+    set(CK_USE_OCP_FP8 "ON")
+endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx94")
+    add_definitions(-DCK_USE_FNUZ_FP8)
+    set(CK_USE_FNUZ_FP8 "ON")
+endif()
+
 option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
 if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))
    add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH)

--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
+[Back to the main page](./README.md)
 # Composable Kernel Developers and Contributors

 This is the list of developers and contributors to Composable Kernel library

--- a/README.md
+++ b/README.md
@@ -26,23 +26,15 @@ The current CK library is structured into four layers:

 ## General information

-To build our documentation locally, use the following code:
-
-``` bash
-cd docs
-pip3 install -r sphinx/requirements.txt
-python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
-```
-
-You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page.
-
-```note
-If you use CK, cite us as follows:
-
-* [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???):
-  This paper will be available on arXiv soon.
-* [CITATION.cff](/CITATION.cff)
-```
+* [CK supported operations](include/ck/README.md)
+* [CK Tile supported operations](include/ck_tile/README.md)
+* [CK wrapper](client_example/25_wrapper/README.md)
+* [CK codegen](codegen/README.md)
+* [CK profiler](profiler/README.md)
+* [Examples (Custom use of CK supported operations)](example/README.md)
+* [Client examples (Use of CK supported operations with instance factory)](client_example/README.md)
+* [Terminology](/TERMINOLOGY.md)
+* [Contributors](/CONTRIBUTORS.md)

 CK is released under the **[MIT license](/LICENSE)**.

@@ -137,6 +129,14 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa

    You can find instructions for running ckProfiler in [profiler](/profiler).

+* Build our documentation locally:
+
+    ``` bash
+    cd docs
+    pip3 install -r sphinx/requirements.txt
+    python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
+    ```
+
 Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
 However, `-j` launches unlimited number of threads, which can cause the build to run out of memory and
 crash. On average, you should expect each thread to use ~2Gb of RAM.

--- a/TERMINOLOGY.md
+++ b/TERMINOLOGY.md
+[Back to the main page](./README.md)
+# Composable Kernel terminology
\ No newline at end of file
--- a/client_example/25_wrapper/README.md
+++ b/client_example/25_wrapper/README.md
+[Back to the main page](../../README.md)
 # Composable Kernel wrapper GEMM tutorial

-This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK)
-wrapper. We present the base version of GEMM without most of the available optimizations; however,
-it's worth noting that CK has kernels with different optimizations.
+This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK) wrapper. We present the base version of GEMM without most of the available optimizations; however, it's worth noting that CK has kernels with different optimizations.

-To implement these optimizations, you can use the CK wrapper or directly use available instances in
-CK. You can also refer to the
-[optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp),
-that uses CK wrapper based on the
-[`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.
+To implement these optimizations, you can use the CK wrapper or directly use available instances in CK. You can also refer to the [optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp), that uses CK wrapper based on the [`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.

 The kernel definition should look similar to:


--- a/client_example/CMakeLists.txt
+++ b/client_example/CMakeLists.txt
@@ -56,6 +56,14 @@ if (GPU_TARGETS)
        add_definitions(-DCK_USE_WMMA)
        set(CK_USE_WMMA "ON")
    endif()
+    if (GPU_TARGETS MATCHES "gfx12")
+        add_definitions(-DCK_USE_OCP_FP8)
+        set(CK_USE_OCP_FP8 "ON")
+    endif()
+    if (GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx94")
+        add_definitions(-DCK_USE_FNUZ_FP8)
+        set(CK_USE_FNUZ_FP8 "ON")
+    endif()
 else()
    add_definitions(-DCK_USE_WMMA -DCK_USE_XDL)
    set(CK_USE_XDL "ON")

--- a/client_example/README.md
+++ b/client_example/README.md
+[Back to the main page](../README.md)
+# Composable Kernel client examples
 ##
 Client application links to CK library, and therefore CK library needs to be installed before building client applications.


--- a/codegen/README.md
+++ b/codegen/README.md
+[Back to the main page](../README.md)
+# Composable Kernel codegen
\ No newline at end of file
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
-rocm-docs-core==1.9.2
+rocm-docs-core==1.10.0
 sphinxcontrib-bibtex==2.6.3
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.9.2
+rocm-docs-core==1.10.0
    # via -r requirements.in
 six==1.16.0
    # via pybtex

--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -76,7 +76,7 @@ struct ProblemSizeSplitK final
 struct ExecutionConfig final
 {
    // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
-    int do_verification = 3;
+    int do_verification = 1;
    int init_method     = 2;
    bool time_kernel    = false;
 };

--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -143,8 +143,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
    switch(config.init_method)
    {
    case 0:
-        ck::utils::FillConstant<ADataType>{static_cast<ADataType>(1.f)}(a_m_k);
-        ck::utils::FillConstant<BDataType>{static_cast<BDataType>(1.f)}(b_k_n);
+        ck::utils::FillConstant<ADataType>{ck::type_convert<ADataType>(1.f)}(a_m_k);
+        ck::utils::FillConstant<BDataType>{ck::type_convert<BDataType>(1.f)}(b_k_n);
        break;
    case 1:
        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);

--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp
@@ -186,15 +186,15 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
            b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
            for(int j = 0; j < NumDMatrices; ++j)
            {
-                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
            }
            break;
        default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
            for(int j = 0; j < NumDMatrices; ++j)
            {
-                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
+                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<DDataType, 0>{});
            }
        }
    }

--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
@@ -190,15 +190,15 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
            b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
            for(int j = 0; j < NumDs; ++j)
            {
-                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
            }
            break;
        default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
            for(int j = 0; j < NumDs; ++j)
            {
-                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
+                d_tensors[i][j].GenerateTensorValue(GeneratorTensor_Sequential<DDataType, 0>{});
            }
        }
    }

--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
@@ -167,11 +167,11 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
            b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
            break;
        default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
        }

-        d0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        d0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<D0DataType, 1>{});
    }

    using GroupedGemmKernelArgument = ck::tensor_operation::device::GroupedGemmKernelArgument<1>;

--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
@@ -157,8 +157,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
            b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
            break;
        default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
        }
    }


--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
@@ -158,8 +158,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
            b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
            break;
        default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
        }
    }


--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once

 struct ProblemSize final
@@ -124,8 +127,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
            b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
            break;
        default:
-            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+            a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+            b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
        }
    }


--- a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -175,8 +175,8 @@ int main(int argc, char* argv[])
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
        break;
    default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{});
-        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
    }

    c0_n_bias.GenerateTensorValue(GeneratorTensor_2<C0DataType>{-5, 5});

--- a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
+++ b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -150,7 +150,7 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[])
        break;
    default:
        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
        b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
    }