Commit 5029a5a4 authored by Jun Liu

Merge branch 'develop' into amd-develop

parents 5ec6a912 95907384
@@ -23,11 +23,11 @@ RUN if [ "$ROCMVERSION" != "6.2" ]; then \
     wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
     sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
     sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
-    elif [ "$ROCMVERSION" = "6.2" ] && [ "$compiler_version" = "rc2" ]; then \
-    sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.1-20.04-1_all.deb --no-check-certificate" && \
-    apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.1-20.04-1_all.deb && \
-    sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.1 rel-48 > /etc/apt/sources.list.d/rocm-build.list' && \
-    amdgpu-repo --amdgpu-build=1736298; \
+    elif [ "$ROCMVERSION" = "6.2" ] && [ "$compiler_version" = "rc1" ]; then \
+    sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.2-20.04-1_all.deb --no-check-certificate" && \
+    apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.2-20.04-1_all.deb && \
+    sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.2 rel-8 > /etc/apt/sources.list.d/rocm-build.list' && \
+    amdgpu-repo --amdgpu-build=1794148; \
     fi
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
@@ -886,10 +886,10 @@ pipeline {
         }
         agent{ label rocmnode("gfx90a") }
         environment{
-            setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """
+            setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1100;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """
             execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                 cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                -DGPU_TARGETS="gfx908;gfx90a" \
+                -DGPU_TARGETS="gfx1100;gfx90a" \
                 -DCMAKE_CXX_COMPILER="${build_compiler()}" \
                 -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
         }
@@ -7,19 +7,23 @@
 #include <initializer_list>
 #include <vector>
+#include "ck/utility/common_header.hpp"
+// __gfx9__ defined in the above header via ck.hpp
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/utility/host_tensor.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/utility/common_header.hpp"
 #include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/host_tensor.hpp"
 #include "ck/wrapper/layout.hpp"
 #include "ck/wrapper/tensor.hpp"
 #include "ck/wrapper/operations/copy.hpp"
 #include "ck/wrapper/operations/gemm.hpp"
 #include "ck/wrapper/utils/kernel_utils.hpp"
+#include "ck/host_utility/device_prop.hpp"

 struct SimpleDeviceMem
 {
@@ -204,6 +208,14 @@ void PerformGemm(const ck::index_t M,
 int main(int argc, char* argv[])
 {
+    bool is_supported = ck::is_xdl_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: xdl example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
     using DataType = ck::half_t;
     const auto thread_layout =
         ck::wrapper::make_layout(ck::make_tuple(ck::Number<64>{}, ck::Number<4>{}),
@@ -213,3 +225,4 @@ int main(int argc, char* argv[])
         3840, 4096, 4096, tile_shape, thread_layout);
     return 0;
 }
+#endif
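Note: the guard added to this example works in two layers. The #if empties the translation unit during device compilation for non-gfx9 targets, so no xdl code is instantiated there; the runtime check makes the binary exit cleanly, as a skip rather than a failure, on hardware without xdl support. Below is a minimal standalone sketch of the same pattern, assuming only the helpers visible in this diff (ck::is_xdl_supported() and ck::get_device_name() from ck/host_utility/device_prop.hpp); the wmma hunks further down apply the identical pattern with ck::is_gfx11_supported().

#include <iostream>

#include "ck/utility/common_header.hpp"
#include "ck/host_utility/device_prop.hpp"

// Layer 1: for device passes targeting a non-gfx9 architecture, compile
// nothing, so no xdl code paths are instantiated for that target.
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))

int main()
{
    // Layer 2: at run time, skip gracefully on hardware without xdl support;
    // returning 0 lets CI on other GPUs treat the example as a benign skip.
    if(!ck::is_xdl_supported())
    {
        std::cout << "WARNING: xdl example not supported on the platform "
                  << ck::get_device_name() << std::endl;
        return 0;
    }

    // ... set up tensors and run the gemm, as in the example above ...
    return 0;
}

#endif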
@@ -7,18 +7,21 @@
 #include <initializer_list>
 #include <vector>
-#include "ck/library/utility/host_tensor.hpp"
+#include "ck/utility/common_header.hpp"
+// __gfx9__ defined in the above header via ck.hpp
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
 #include "ck/host_utility/kernel_launch.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/utility/common_header.hpp"
 #include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/host_tensor.hpp"
 #include "ck/wrapper/layout.hpp"
 #include "ck/wrapper/tensor.hpp"
 #include "ck/wrapper/operations/copy.hpp"
 #include "ck/wrapper/operations/gemm.hpp"
 #include "ck/wrapper/utils/kernel_utils.hpp"
+#include "ck/host_utility/device_prop.hpp"

 struct SimpleDeviceMem
 {
@@ -296,6 +299,14 @@ void PerformGemm(const ck::index_t M,
 int main(int argc, char* argv[])
 {
+    bool is_supported = ck::is_xdl_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: xdl example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
     using DataType = ck::half_t;
     const auto thread_layout =
         ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
@@ -305,3 +316,4 @@ int main(int argc, char* argv[])
         3840, 4096, 4096, tile_shape, thread_layout);
     return 0;
 }
+#endif
-rocm-docs-core==1.4.0
+rocm-docs-core==1.4.1
 sphinxcontrib-bibtex==2.6.2

@@ -103,7 +103,7 @@ requests==2.31.0
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.4.0
+rocm-docs-core==1.4.1
     # via -r requirements.in
 six==1.16.0
     # via
@@ -17,6 +17,7 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
+#include "ck/host_utility/device_prop.hpp"

 struct AlphaBetaAdd
 {
@@ -175,6 +176,14 @@ int main(int argc, char* argv[])
         exit(0);
     }
+    bool is_supported = ck::is_gfx11_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
             using namespace ck::literals;
@@ -17,6 +17,7 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
+#include "ck/host_utility/device_prop.hpp"

 struct AlphaBetaAdd
 {
@@ -175,6 +176,14 @@ int main(int argc, char* argv[])
         exit(0);
     }
+    bool is_supported = ck::is_gfx11_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
             using namespace ck::literals;
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include "common_wmma.hpp"
+#include "ck/host_utility/device_prop.hpp"

 // kernel data types
 using InKernelDataType = FP16;
@@ -23,4 +24,14 @@ using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;

 #include "run_grouped_conv_fwd_bias_relu_add_wmma_example.inc"

-int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    bool is_supported = ck::is_gfx11_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
+    return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv);
+}
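Note: the rewritten main keeps the existing exit-code convention. The run_*_example helpers return true on success, so negating the result maps success to exit code 0, while the new early return 0 deliberately reports an unsupported platform as a pass rather than a failure. A tiny sketch of that mapping, where run_example is a hypothetical stand-in and not a library function:

// run_example stands in for the run_*_example helpers, which return true on
// success; it is hypothetical, not part of composable_kernel.
bool run_example() { return true; }

int main()
{
    // !true -> 0 (process success), !false -> 1 (process failure).
    return !run_example();
}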
@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include "common_wmma.hpp"
+#include "ck/host_utility/device_prop.hpp"

 // kernel data types
 using InKernelDataType = I8;
@@ -23,4 +24,14 @@ using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;

 #include "run_grouped_conv_fwd_bias_relu_add_wmma_example.inc"

-int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    bool is_supported = ck::is_gfx11_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
+    return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv);
+}
@@ -27,6 +27,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+#include "ck/host_utility/device_prop.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -163,4 +164,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<

 #include "run_batched_gemm_scale_softmax_gemm_permute_wmma.inc"

-int main(int argc, char* argv[]) { return run(argc, argv); }
+int main(int argc, char* argv[])
+{
+    bool is_supported = ck::is_gfx11_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
+    return run(argc, argv);
+}
@@ -27,6 +27,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+#include "ck/host_utility/device_prop.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -285,4 +286,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<

 #include "run_batched_gemm_scale_softmax_gemm_permute_wmma.inc"

-int main(int argc, char* argv[]) { return run(argc, argv); }
+int main(int argc, char* argv[])
+{
+    bool is_supported = ck::is_gfx11_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
+    return run(argc, argv);
+}
@@ -27,6 +27,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+#include "ck/host_utility/device_prop.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -351,4 +352,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<

 #include "run_cross_attention_wmma.inc"

-int main(int argc, char* argv[]) { return run(argc, argv); }
+int main(int argc, char* argv[])
+{
+    bool is_supported = ck::is_gfx11_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
+    return run(argc, argv);
+}
@@ -28,6 +28,7 @@ Example is GQA-4
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+#include "ck/host_utility/device_prop.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -299,4 +300,14 @@ using ReferenceGemm1Instance =

 #include "run_grouped_query_attention_forward_wmma.inc"

-int main(int argc, char* argv[]) { return run(argc, argv); }
+int main(int argc, char* argv[])
+{
+    bool is_supported = ck::is_gfx11_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
+    return run(argc, argv);
+}
@@ -26,6 +26,7 @@ Shazeer, Noam. “Fast Transformer Decoding: One Write-Head Is All You Need.”
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+#include "ck/host_utility/device_prop.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -284,4 +285,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm_

 #include "run_multi_query_attention_forward_wmma.inc"

-int main(int argc, char* argv[]) { return run(argc, argv); }
+int main(int argc, char* argv[])
+{
+    bool is_supported = ck::is_gfx11_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
+    return run(argc, argv);
+}
@@ -27,6 +27,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+#include "ck/host_utility/device_prop.hpp"

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -329,4 +330,14 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<

 #include "run_self_attention_wmma.inc"

-int main(int argc, char* argv[]) { return run(argc, argv); }
+int main(int argc, char* argv[])
+{
+    bool is_supported = ck::is_gfx11_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
+    return run(argc, argv);
+}
@@ -3,6 +3,7 @@

 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp"
 #include "common.hpp"
+#include "ck/host_utility/device_prop.hpp"

 using OutDataType = FP16;
 using WeiDataType = FP16;
@@ -31,4 +32,14 @@ using DeviceConvInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDat

 #include "run_grouped_conv_bwd_data_example.inc"

-int main(int argc, char* argv[]) { return run_grouped_conv_bwd_data_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    bool is_supported = ck::is_gfx11_supported();
+    if(!is_supported)
+    {
+        std::cout << "WARNING: wmma example not supported on the platform " << ck::get_device_name()
+                  << std::endl;
+        return 0;
+    }
+    return run_grouped_conv_bwd_data_example(argc, argv);
+}
@@ -181,7 +181,7 @@ endfunction(add_example_executable_no_testing EXAMPLE_NAME)
 # add all example subdir
 file(GLOB dir_list LIST_DIRECTORIES true *)
 FOREACH(subdir ${dir_list})
-    IF(IS_DIRECTORY "${subdir}")
+    if(IS_DIRECTORY "${subdir}" AND EXISTS "${subdir}/CMakeLists.txt")
         add_subdirectory(${subdir})
     ENDIF()
 ENDFOREACH()
@@ -47,12 +47,12 @@ __global__ void
 #endif
     kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3(
         typename GridwiseGemm::Argument karg,
-        const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
-        const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
-        const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+        [[maybe_unused]] const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
+        [[maybe_unused]] const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
+        [[maybe_unused]] const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
             c_grid_desc_mblock_mperblock_nblock_nperblock,
-        const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
-        const index_t num_k_per_block)
+        [[maybe_unused]] const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+        [[maybe_unused]] const index_t num_k_per_block)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx94__))
@@ -103,12 +103,12 @@ __global__ void
 #endif
     kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds(
         typename GridwiseGemm::Argument karg,
-        const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
-        const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
-        const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+        [[maybe_unused]] const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
+        [[maybe_unused]] const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
+        [[maybe_unused]] const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
            c_grid_desc_mblock_mperblock_nblock_nperblock,
-        const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
-        const index_t num_k_per_block)
+        [[maybe_unused]] const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+        [[maybe_unused]] const index_t num_k_per_block)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
@@ -69,14 +69,15 @@ __global__ void
 #if CK_USE_LAUNCH_BOUNDS
 __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #endif
-    kernel_grouped_conv_fwd_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg,
-                                            const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
-                                            const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
-                                            const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
-                                                c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                            const ComputePtrOffset compute_ptr_offset_of_groups,
-                                            const ComputePtrOffset compute_ptr_offset_of_n,
-                                            const index_t groups_count)
+    kernel_grouped_conv_fwd_xdl_cshuffle_v3(
+        typename GridwiseGemm::Argument karg,
+        [[maybe_unused]] const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
+        [[maybe_unused]] const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
+        [[maybe_unused]] const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+            c_grid_desc_mblock_mperblock_nblock_nperblock,
+        [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_groups,
+        [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_n,
+        [[maybe_unused]] const index_t groups_count)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     // offset base pointer for each work-group
@@ -132,13 +133,13 @@ __global__ void
 #endif
     kernel_grouped_conv_fwd_xdl_cshuffle_v3_2lds(
         typename GridwiseGemm::Argument karg,
-        const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
-        const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
-        const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+        [[maybe_unused]] const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
+        [[maybe_unused]] const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
+        [[maybe_unused]] const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
            c_grid_desc_mblock_mperblock_nblock_nperblock,
-        const ComputePtrOffset compute_ptr_offset_of_groups,
-        const ComputePtrOffset compute_ptr_offset_of_n,
-        const index_t groups_count)
+        [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_groups,
+        [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_n,
+        [[maybe_unused]] const index_t groups_count)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     // offset base pointer for each work-group
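Note: the [[maybe_unused]] annotations change no behavior. When the architecture #if compiles a kernel body away, every parameter is unreferenced in that compilation pass, so builds that promote unused-parameter warnings to errors would break. A minimal sketch of the effect under that assumption; the kernel below is illustrative and not part of the library:

#include <hip/hip_runtime.h>

// Illustrative HIP kernel (not from composable_kernel). For device passes
// where the #if excludes the body, the parameters appear in no code path;
// [[maybe_unused]] suppresses -Wunused-parameter in exactly those passes.
__global__ void arch_guarded_copy([[maybe_unused]] const float* in,
                                  [[maybe_unused]] float* out)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    const unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
    out[i] = in[i];
#endif
    // On a non-gfx9 device pass the body above is compiled out, leaving an
    // empty function whose parameters would otherwise trigger the warning.
}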