Commit 6975cb8f authored by carlushuang's avatar carlushuang
Browse files

Merge remote-tracking branch 'origin/develop' into ck_tile/fav3_fwd_sept

parents 33aff2ef 6834e5ee
...@@ -26,11 +26,15 @@ set(version 1.1.0) ...@@ -26,11 +26,15 @@ set(version 1.1.0)
project(composable_kernel VERSION ${version} LANGUAGES CXX HIP) project(composable_kernel VERSION ${version} LANGUAGES CXX HIP)
include(CTest) include(CTest)
# Usage: for customized Python location cmake -DCK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8"
# CK Codegen requires dataclass which is added in Python 3.7
# Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04
if(NOT CK_USE_ALTERNATIVE_PYTHON) if(NOT CK_USE_ALTERNATIVE_PYTHON)
find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED) find_package(Python3 3.8 COMPONENTS Interpreter REQUIRED)
else() else()
message("Using alternative python version") message("Using alternative python version")
set(EXTRA_PYTHON_PATH) set(EXTRA_PYTHON_PATH)
# this is overly restrictive, we may need to be more flexible on the following
string(REPLACE "/bin/python3.8" "" EXTRA_PYTHON_PATH "${CK_USE_ALTERNATIVE_PYTHON}") string(REPLACE "/bin/python3.8" "" EXTRA_PYTHON_PATH "${CK_USE_ALTERNATIVE_PYTHON}")
message("alternative python path is: ${EXTRA_PYTHON_PATH}") message("alternative python path is: ${EXTRA_PYTHON_PATH}")
find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED) find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED)
......
...@@ -703,7 +703,7 @@ def process_results(Map conf=[:]){ ...@@ -703,7 +703,7 @@ def process_results(Map conf=[:]){
} }
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2; RUN_CK_TILE_TESTS=true CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=;RUN_CK_TILE_GEMM_TESTS=true
0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true 0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
...@@ -775,9 +775,13 @@ pipeline { ...@@ -775,9 +775,13 @@ pipeline {
defaultValue: false, defaultValue: false,
description: "Run the grouped conv large cases tests (default: OFF)") description: "Run the grouped conv large cases tests (default: OFF)")
booleanParam( booleanParam(
name: "RUN_CK_TILE_TESTS", name: "RUN_CK_TILE_FMHA_TESTS",
defaultValue: false, defaultValue: false,
description: "Run the ck_tile tests (default: OFF)") description: "Run the ck_tile FMHA tests (default: OFF)")
booleanParam(
name: "RUN_CK_TILE_GEMM_TESTS",
defaultValue: false,
description: "Run the ck_tile GEMM tests (default: OFF)")
booleanParam( booleanParam(
name: "BUILD_INSTANCES_ONLY", name: "BUILD_INSTANCES_ONLY",
defaultValue: false, defaultValue: false,
...@@ -894,15 +898,15 @@ pipeline { ...@@ -894,15 +898,15 @@ pipeline {
} }
} }
} }
stage("Run CK_TILE Tests") stage("Run CK_TILE_FMHA Tests")
{ {
parallel parallel
{ {
stage("Run CK_TILE Tests on gfx90a") stage("Run CK_TILE_FMHA Tests on gfx90a")
{ {
when { when {
beforeAgent true beforeAgent true
expression { params.RUN_CK_TILE_TESTS.toBoolean() } expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
} }
agent{ label rocmnode("gfx90a") } agent{ label rocmnode("gfx90a") }
environment{ environment{
...@@ -917,11 +921,11 @@ pipeline { ...@@ -917,11 +921,11 @@ pipeline {
cleanWs() cleanWs()
} }
} }
stage("Run CK_TILE Tests on gfx942") stage("Run CK_TILE_FMHA Tests on gfx942")
{ {
when { when {
beforeAgent true beforeAgent true
expression { params.RUN_CK_TILE_TESTS.toBoolean() } expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
} }
agent{ label rocmnode("gfx942") } agent{ label rocmnode("gfx942") }
environment{ environment{
...@@ -937,6 +941,52 @@ pipeline { ...@@ -937,6 +941,52 @@ pipeline {
} }
} }
} }
}
stage("Run CK_TILE_GEMM Tests")
{
parallel
{
stage("Run CK_TILE_GEMM Tests on gfx90a")
{
when {
beforeAgent true
expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 tile_example_gemm_basic && \
cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
stage("Run CK_TILE_GEMM Tests on gfx942")
{
when {
beforeAgent true
expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx942") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \
make -j64 tile_example_gemm_basic && \
cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
}
} }
stage("Build CK and run Tests") stage("Build CK and run Tests")
{ {
......
...@@ -6,6 +6,9 @@ if(NOT INSTANCES_ONLY) ...@@ -6,6 +6,9 @@ if(NOT INSTANCES_ONLY)
set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP) set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE) get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
add_executable(codegen_test_${BASE_NAME} ${TEST_SRC}) add_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
if(CK_USE_ALTERNATIVE_PYTHON)
target_link_options(codegen_test_${BASE_NAME} PRIVATE -lstdc++fs)
endif()
add_dependencies(codegen codegen_test_${BASE_NAME}) add_dependencies(codegen codegen_test_${BASE_NAME})
add_dependencies(tests codegen_test_${BASE_NAME}) add_dependencies(tests codegen_test_${BASE_NAME})
add_dependencies(check codegen_test_${BASE_NAME}) add_dependencies(check codegen_test_${BASE_NAME})
......
...@@ -2,14 +2,14 @@ ...@@ -2,14 +2,14 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#include <rtc/kernel.hpp> #include <rtc/kernel.hpp>
#include <filesystem> #include <ck/filesystem.hpp>
#include <string> #include <string>
namespace rtc { namespace rtc {
struct src_file struct src_file
{ {
std::filesystem::path path; CK::fs::path path;
std::string_view content; std::string_view content;
}; };
......
...@@ -2,13 +2,13 @@ ...@@ -2,13 +2,13 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#include <string> #include <string>
#include <filesystem> #include <ck/filesystem.hpp>
namespace rtc { namespace rtc {
struct tmp_dir struct tmp_dir
{ {
std::filesystem::path path; CK::fs::path path;
tmp_dir(const std::string& prefix = ""); tmp_dir(const std::string& prefix = "");
void execute(const std::string& cmd) const; void execute(const std::string& cmd) const;
......
...@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options ...@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
for(const auto& src : srcs) for(const auto& src : srcs)
{ {
std::filesystem::path full_path = td.path / src.path; CK::fs::path full_path = td.path / src.path;
std::filesystem::path parent_path = full_path.parent_path(); CK::fs::path parent_path = full_path.parent_path();
std::filesystem::create_directories(parent_path); CK::fs::create_directories(parent_path);
write_string(full_path.string(), src.content); write_string(full_path.string(), src.content);
if(src.path.extension().string() == ".cpp") if(src.path.extension().string() == ".cpp")
{ {
...@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options ...@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
td.execute(compiler() + options.flags); td.execute(compiler() + options.flags);
auto out_path = td.path / out; auto out_path = td.path / out;
if(not std::filesystem::exists(out_path)) if(not CK::fs::exists(out_path))
throw std::runtime_error("Output file missing: " + out); throw std::runtime_error("Output file missing: " + out);
auto obj = read_buffer(out_path.string()); auto obj = read_buffer(out_path.string());
......
...@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix) ...@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix)
} }
tmp_dir::tmp_dir(const std::string& prefix) tmp_dir::tmp_dir(const std::string& prefix)
: path(std::filesystem::temp_directory_path() / : path(CK::fs::temp_directory_path() /
unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix)) unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix))
{ {
std::filesystem::create_directories(this->path); CK::fs::create_directories(this->path);
} }
void tmp_dir::execute(const std::string& cmd) const void tmp_dir::execute(const std::string& cmd) const
...@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const ...@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const
std::system(s.c_str()); std::system(s.c_str());
} }
tmp_dir::~tmp_dir() { std::filesystem::remove_all(this->path); } tmp_dir::~tmp_dir() { CK::fs::remove_all(this->path); }
} // namespace rtc } // namespace rtc
rocm-docs-core==1.7.2 rocm-docs-core==1.7.2
sphinxcontrib-bibtex==2.6.2 sphinxcontrib-bibtex==2.6.3
...@@ -137,7 +137,7 @@ sphinx-notfound-page==1.0.3 ...@@ -137,7 +137,7 @@ sphinx-notfound-page==1.0.3
# via rocm-docs-core # via rocm-docs-core
sphinxcontrib-applehelp==2.0.0 sphinxcontrib-applehelp==2.0.0
# via sphinx # via sphinx
sphinxcontrib-bibtex==2.6.2 sphinxcontrib-bibtex==2.6.3
# via -r requirements.in # via -r requirements.in
sphinxcontrib-devhelp==2.0.0 sphinxcontrib-devhelp==2.0.0
# via sphinx # via sphinx
......
...@@ -22,12 +22,20 @@ string(REPLACE ";" "," FMHA_FWD_APIS "${FMHA_FWD_ENABLE_APIS}") ...@@ -22,12 +22,20 @@ string(REPLACE ";" "," FMHA_FWD_APIS "${FMHA_FWD_ENABLE_APIS}")
execute_process( execute_process(
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
--api ${FMHA_FWD_APIS} --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt --api ${FMHA_FWD_APIS} --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt
RESULT_VARIABLE ret
) )
if(ret AND NOT ret EQUAL 0)
message( FATAL_ERROR "CK Tile FMHA FAILED to genrate a list of FWD kernels via Python.")
endif()
execute_process( execute_process(
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
--api bwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt --receipt 3 --api bwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt --receipt 3
RESULT_VARIABLE ret
) )
if(ret AND NOT ret EQUAL 0)
message( FATAL_ERROR "CK Tile FMHA FAILED to genrate a list of BWD kernels via Python.")
endif()
# NOTE: for cmake, the FMHA_FWD_GEN_BLOBS/FMHA_BWD_GEN_BLOBS files must be in the same directory # NOTE: for cmake, the FMHA_FWD_GEN_BLOBS/FMHA_BWD_GEN_BLOBS files must be in the same directory
# as current cmake list, otherwise will not figure out the dependency properly # as current cmake list, otherwise will not figure out the dependency properly
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_basic.hpp" #include "gemm_basic.hpp"
#include "ck_tile/host.hpp" #include <hip/hip_runtime.h>
#include <cstring> #include <cstring>
#include <iostream> #include <iostream>
...@@ -21,7 +21,7 @@ auto create_args(int argc, char* argv[]) ...@@ -21,7 +21,7 @@ auto create_args(int argc, char* argv[])
.insert("stride_a", "0", "Tensor A stride") .insert("stride_a", "0", "Tensor A stride")
.insert("stride_b", "0", "Tensor B stride") .insert("stride_b", "0", "Tensor B stride")
.insert("stride_c", "0", "Tensor C stride") .insert("stride_c", "0", "Tensor C stride")
.insert("v", "1", "cpu validation or not") .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
.insert("e", "1e-5", "Absolute error tolerance") .insert("e", "1e-5", "Absolute error tolerance")
.insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8") .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
.insert("warmup", "10", "number of iterations before benchmark the kernel") .insert("warmup", "10", "number of iterations before benchmark the kernel")
...@@ -32,41 +32,22 @@ auto create_args(int argc, char* argv[]) ...@@ -32,41 +32,22 @@ auto create_args(int argc, char* argv[])
return std::make_tuple(result, arg_parser); return std::make_tuple(result, arg_parser);
} }
template <typename LayoutA, typename LayoutB, typename LayoutC> template <typename LayoutA,
typename LayoutB,
typename LayoutC,
typename PipelineProblem,
typename GemmPipeline,
typename GemmShape>
float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
{ {
// ToDo: This will be modified by the codegen code later.
constexpr ck_tile::index_t M_Tile = 128;
constexpr ck_tile::index_t N_Tile = 128;
constexpr ck_tile::index_t K_Tile = 32;
constexpr ck_tile::index_t M_Warp = 2;
constexpr ck_tile::index_t N_Warp = 2;
constexpr ck_tile::index_t K_Warp = 1;
constexpr ck_tile::index_t M_Warp_Tile = 32;
constexpr ck_tile::index_t N_Warp_Tile = 32;
constexpr ck_tile::index_t K_Warp_Tile = 8;
// The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part. // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
constexpr bool kPadA = true; constexpr bool kPadA = true;
constexpr bool kPadB = true; constexpr bool kPadB = true;
constexpr bool kPadC = false;
constexpr int kBlockPerCu = 1; constexpr int kBlockPerCu = 1;
// ===============================================
using GemmShape =
ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
using TilePartitioner = ck_tile::GemmTilePartitioner<GemmShape>; using TilePartitioner = ck_tile::GemmTilePartitioner<GemmShape>;
using PipelineProblem = ck_tile:: using GemmEpilogue = ck_tile::Default2DEpilogue<
BlockGemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, kPadA, kPadB, kPadC>;
// The GemmPipeline should also come from the Codegen.
using GemmPipeline = ck_tile::BlockGemmPipelineAGmemBGmemCRegV1<PipelineProblem>;
using GemmEpilogue = ck_tile::Default2DEpilogue<
ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadA, kPadB>>; ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadA, kPadB>>;
// ToDo: Will add the codegen part to test different pipeline policies in GEMM. // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
// Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
...@@ -93,7 +74,13 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) ...@@ -93,7 +74,13 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
return ave_time; return ave_time;
} }
template <typename DataType, typename LayoutA, typename LayoutB, typename LayoutC> template <typename DataType,
typename LayoutA,
typename LayoutB,
typename LayoutC,
typename PipelineProblem,
typename GemmPipeline,
typename GemmShape>
float invoke_gemm(ck_tile::DeviceMem& a_buf, float invoke_gemm(ck_tile::DeviceMem& a_buf,
ck_tile::DeviceMem& b_buf, ck_tile::DeviceMem& b_buf,
ck_tile::DeviceMem& c_buf, ck_tile::DeviceMem& c_buf,
...@@ -155,7 +142,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf, ...@@ -155,7 +142,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf,
else else
{ {
args.stride_B = [&]() { args.stride_B = [&]() {
if constexpr(std::is_same_v<LayoutB, ck_tile::tensor_layout::gemm::ColumnMajor>) if constexpr(std::is_same_v<LayoutB, ck_tile::tensor_layout::gemm::RowMajor>)
{ {
return N; return N;
} }
...@@ -184,8 +171,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf, ...@@ -184,8 +171,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf,
}(); }();
} }
float ave_time = float ave_time = gemm_calc<LayoutA, LayoutB, LayoutC, PipelineProblem, GemmPipeline, GemmShape>(
gemm_calc<LayoutA, LayoutB, LayoutC>(args, ck_tile::stream_config{nullptr, true}); args, ck_tile::stream_config{nullptr, true});
std::size_t num_byte = std::size_t num_byte =
sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N; sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
float gb_per_sec = num_byte / 1.E6 / ave_time; float gb_per_sec = num_byte / 1.E6 / ave_time;
...@@ -212,7 +199,7 @@ int main(int argc, char* argv[]) ...@@ -212,7 +199,7 @@ int main(int argc, char* argv[])
// The Matrix Multiplication goes with Matrix A (M, K), Matrix B (N, K) = Matrix C (M, N). // The Matrix Multiplication goes with Matrix A (M, K), Matrix B (N, K) = Matrix C (M, N).
using matrix_a_layout = ck_tile::tensor_layout::gemm::RowMajor; using matrix_a_layout = ck_tile::tensor_layout::gemm::RowMajor;
using matrix_b_layout = ck_tile::tensor_layout::gemm::RowMajor; using matrix_b_layout = ck_tile::tensor_layout::gemm::ColumnMajor;
using matrix_c_layout = ck_tile::tensor_layout::gemm::RowMajor; using matrix_c_layout = ck_tile::tensor_layout::gemm::RowMajor;
// host verify // host verify
...@@ -221,7 +208,7 @@ int main(int argc, char* argv[]) ...@@ -221,7 +208,7 @@ int main(int argc, char* argv[])
? std::vector<int>{M, K} ? std::vector<int>{M, K}
: std::vector<int>{K, M}; : std::vector<int>{K, M};
std::vector<int> b_dimensions = std::vector<int> b_dimensions =
(std::is_same_v<matrix_b_layout, ck_tile::tensor_layout::gemm::RowMajor>) (std::is_same_v<matrix_b_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
? std::vector<int>{N, K} ? std::vector<int>{N, K}
: std::vector<int>{K, N}; : std::vector<int>{K, N};
std::vector<int> c_dimensions = std::vector<int> c_dimensions =
...@@ -245,12 +232,52 @@ int main(int argc, char* argv[]) ...@@ -245,12 +232,52 @@ int main(int argc, char* argv[])
a_buf.ToDevice(a_host.data()); a_buf.ToDevice(a_host.data());
b_buf.ToDevice(b_host.data()); b_buf.ToDevice(b_host.data());
invoke_gemm<ck_tile::half_t, matrix_a_layout, matrix_b_layout, matrix_c_layout>( // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
a_buf, b_buf, c_buf, arg_parser); constexpr bool kPadA = true;
constexpr bool kPadB = true;
constexpr bool kPadC = false;
bool pass = true; // This part comes from the Codegen
constexpr ck_tile::index_t M_Tile = 128;
constexpr ck_tile::index_t N_Tile = 128;
constexpr ck_tile::index_t K_Tile = 32;
if(arg_parser.get_bool("v")) constexpr ck_tile::index_t M_Warp = 2;
constexpr ck_tile::index_t N_Warp = 2;
constexpr ck_tile::index_t K_Warp = 1;
constexpr ck_tile::index_t M_Warp_Tile = 32;
constexpr ck_tile::index_t N_Warp_Tile = 32;
constexpr ck_tile::index_t K_Warp_Tile = 8;
using CodegenGemmShape =
ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
using CodegenPipelineProblem = ck_tile::BlockGemmPipelineProblem<ADataType,
BDataType,
AccDataType,
CodegenGemmShape,
kPadA,
kPadB,
kPadC>;
using CodegenGemmPipeline = ck_tile::BlockGemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
invoke_gemm<ck_tile::half_t,
matrix_a_layout,
matrix_b_layout,
matrix_c_layout,
CodegenPipelineProblem,
CodegenGemmPipeline,
CodegenGemmShape>(a_buf, b_buf, c_buf, arg_parser);
c_buf.FromDevice(c_host_dev.data());
bool pass_cpu = true;
if(arg_parser.get_int("v") == 1)
{ {
// ToDo: Will Add the Element Op (bias) verification in the future. // ToDo: Will Add the Element Op (bias) verification in the future.
ck_tile::reference_gemm<ADataType, ck_tile::reference_gemm<ADataType,
...@@ -261,14 +288,71 @@ int main(int argc, char* argv[]) ...@@ -261,14 +288,71 @@ int main(int argc, char* argv[])
matrix_b_layout, matrix_b_layout,
matrix_c_layout>(a_host, b_host, c_host_ref); matrix_c_layout>(a_host, b_host, c_host_ref);
c_buf.FromDevice(c_host_dev.data()); pass_cpu = ck_tile::check_err(c_host_dev, c_host_ref);
std::cout << "The CPU veification result is:" << (pass_cpu ? "correct" : "fail")
<< std::flush;
}
bool pass_gpu = true;
if(arg_parser.get_int("v") == 2)
{
ck_tile::index_t stride_a = arg_parser.get_int("stride_a");
ck_tile::index_t stride_b = arg_parser.get_int("stride_b");
ck_tile::index_t stride_c = arg_parser.get_int("stride_c");
if(stride_a == 0)
{
if constexpr(std::is_same_v<matrix_a_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
{
stride_a = M;
}
else
{
stride_a = K;
}
}
if(stride_b == 0)
{
if constexpr(std::is_same_v<matrix_b_layout, ck_tile::tensor_layout::gemm::RowMajor>)
{
stride_b = N;
}
else
{
stride_b = K;
}
}
if(stride_c == 0)
{
if constexpr(std::is_same_v<matrix_c_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
{
stride_c = M;
}
else
{
stride_c = N;
}
}
ck_tile::HostTensor<CDataType> c_host_gpu_ref(c_dimensions);
ck_tile::DeviceMem c_gpu_buf(c_host_gpu_ref.get_element_space_size_in_bytes());
ck_tile::reference_gemm_gpu<ADataType, BDataType, AccDataType, CDataType>(
a_buf, b_buf, c_gpu_buf, M, N, K, stride_a, stride_b, stride_c);
c_buf.FromDevice(c_host_gpu_ref.data());
pass = ck_tile::check_err(c_host_dev, c_host_ref); pass_gpu = ck_tile::check_err(c_host_dev, c_host_gpu_ref);
std::cout << "The veification result is:" << (pass ? "correct" : "fail") << std::flush; std::cout << "The GPU veification result is:" << (pass_gpu ? "correct" : "fail")
<< std::flush;
} }
std::cout << std::endl << std::flush; std::cout << std::endl << std::flush;
return !pass; return !pass_gpu;
} }
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include "ck_tile/host/kernel_launch.hpp" #include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/epilogue.hpp" #include "ck_tile/ops/epilogue.hpp"
#include "ck_tile/ops/gemm.hpp" #include "ck_tile/ops/gemm.hpp"
#include "ck_tile/host.hpp"
#include <string> #include <string>
template <typename DataType> template <typename DataType>
......
#!/bin/bash
#
# in order to run this script you'd first need to build the tile_example_gemm executables in ../build/bin/
#
# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>
# input arguments:
# environment tag : a string describing the specifics of your test environment
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
# host name : $hostname
# gpu architecture: e.g., gfx90a, or gfx942, etc.
# get the command line arguments:
export env_type=$1
echo 'Environment type: ' $env_type
export branch=$2
echo 'Branch name: ' $branch
export host_name=$3
echo 'Host name: ' $host_name
export GPU_arch=$4
echo 'GPU_arch: ' $GPU_arch
# run verification tests
example/ck_tile/03_gemm/script/smoke_test.sh
# We do not have a performance benchmark for gemm yet. Will add it in the future.
\ No newline at end of file
#!/bin/bash
EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)"
KNAME=1
export CK_WARMUP=0
export CK_REPEAT=1
COMMON_ARGS='-v=2 -warmup=0 -repeat=1'
run_fp16_tests() {
for batch in 1 2; do
for m in 128 1024; do
for n in 128 2048; do
for k in 32 64; do
$EXE -b=$batch -m=$m -n=$n -k=$k -stride_a=0 -stride_b=0 -stride_c=0 -e=1e-5 -prec=fp16 $COMMON_ARGS
if [ $? -eq 0 ]; then
echo "Success: Test with batch=$batch, m=$m, n=$n, k=$k executed successfully."
else
echo "Error: Test with batch=$batch, m=$m, n=$n, k=$k failed to execute properly."
# Optionally, exit or break if you need to halt further execution
# exit 1
fi
done
done
done
done
}
set -x
run_fp16_tests
set +x
\ No newline at end of file
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#ifndef GUARD_CK_FILESYSTEM_HPP_
#define GUARD_CK_FILESYSTEM_HPP_
#include <string>
#include <string_view>
// clang-format off
#if defined(CPPCHECK)
#define CK_HAS_FILESYSTEM 1
#define CK_HAS_FILESYSTEM_TS 1
#elif defined(_WIN32)
#if _MSC_VER >= 1920
#define CK_HAS_FILESYSTEM 1
#define CK_HAS_FILESYSTEM_TS 0
#elif _MSC_VER >= 1900
#define CK_HAS_FILESYSTEM 0
#define CK_HAS_FILESYSTEM_TS 1
#else
#define CK_HAS_FILESYSTEM 0
#define CK_HAS_FILESYSTEM_TS 0
#endif
#elif defined(__has_include)
#if __has_include(<filesystem>) && __cplusplus >= 201703L
#define CK_HAS_FILESYSTEM 1
#else
#define CK_HAS_FILESYSTEM 0
#endif
#if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L
#define CK_HAS_FILESYSTEM_TS 1
#else
#define CK_HAS_FILESYSTEM_TS 0
#endif
#else
#define CK_HAS_FILESYSTEM 0
#define CK_HAS_FILESYSTEM_TS 0
#endif
// clang-format on
#if CK_HAS_FILESYSTEM
#include <filesystem>
#elif CK_HAS_FILESYSTEM_TS
#include <experimental/filesystem>
#else
#error "No filesystem include available"
#endif
namespace CK {
#if CK_HAS_FILESYSTEM
namespace fs = ::std::filesystem;
#elif CK_HAS_FILESYSTEM_TS
namespace fs = ::std::experimental::filesystem;
#endif
} // namespace CK
inline std::string operator+(const std::string_view s, const CK::fs::path& path)
{
return path.string().insert(0, s);
}
inline std::string operator+(const CK::fs::path& path, const std::string_view s)
{
return path.string().append(s);
}
#define FS_ENUM_PERMS_ALL fs::perms::all
#if CK_HAS_FILESYSTEM_TS
#ifdef __linux__
#include <linux/limits.h>
namespace CK {
inline fs::path weakly_canonical(const fs::path& path)
{
std::string result(PATH_MAX, '\0');
std::string p{path.is_relative() ? (fs::current_path() / path).string() : path.string()};
char* retval = realpath(p.c_str(), &result[0]);
return (retval == nullptr) ? path : fs::path{result};
}
} // namespace CK
#else
#error "Not implmeneted!"
#endif
#else
namespace CK {
inline fs::path weakly_canonical(const fs::path& path) { return fs::weakly_canonical(path); }
} // namespace CK
#endif
namespace CK {
#ifdef _WIN32
constexpr std::string_view executable_postfix{".exe"};
constexpr std::string_view library_prefix{""};
constexpr std::string_view dynamic_library_postfix{".dll"};
constexpr std::string_view static_library_postfix{".lib"};
constexpr std::string_view object_file_postfix{".obj"};
#else
constexpr std::string_view executable_postfix{""};
constexpr std::string_view library_prefix{"lib"};
constexpr std::string_view dynamic_library_postfix{".so"};
constexpr std::string_view static_library_postfix{".a"};
constexpr std::string_view object_file_postfix{".o"};
#endif
inline fs::path make_executable_name(const fs::path& path)
{
return path.parent_path() / (path.filename() + executable_postfix);
}
inline fs::path make_dynamic_library_name(const fs::path& path)
{
return path.parent_path() / (library_prefix + path.filename() + dynamic_library_postfix);
}
inline fs::path make_object_file_name(const fs::path& path)
{
return path.parent_path() / (path.filename() + object_file_postfix);
}
inline fs::path make_static_library_name(const fs::path& path)
{
return path.parent_path() / (library_prefix + path.filename() + static_library_postfix);
}
struct FsPathHash
{
std::size_t operator()(const fs::path& path) const { return fs::hash_value(path); }
};
} // namespace CK
#endif // GUARD_CK_FILESYSTEM_HPP_
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#include "ck_tile/core.hpp" #include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp" #include "ck_tile/host/host_tensor.hpp"
#include "ck_tile/ops/common.hpp" #include "ck_tile/ops/common/tensor_layout.hpp"
#include <thread> #include <thread>
namespace ck_tile { namespace ck_tile {
...@@ -57,4 +57,121 @@ CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k, ...@@ -57,4 +57,121 @@ CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
make_ParallelTensorFunctor(f, M)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(f, M)(std::thread::hardware_concurrency());
} }
template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
__global__ void naive_gemm_kernel(ADataType* A,
BDataType* B,
CDataType* C,
ck_tile::index_t M,
ck_tile::index_t N,
ck_tile::index_t K,
ck_tile::index_t strideA,
ck_tile::index_t strideB,
ck_tile::index_t strideC)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int row = idx / N; // Compute row index
int col = idx % N; // Compute column index
if(row < M && col < N)
{
AccDataType acc = 0.0;
for(int k = 0; k < K; ++k)
{
acc += static_cast<AccDataType>(A[row * strideA + k]) *
static_cast<AccDataType>(B[col * strideB + k]);
}
C[row * strideC + col] = acc; // Store as AccDataType
}
}
template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
void reference_gemm_gpu(DeviceMem& a_device,
DeviceMem& b_device,
DeviceMem& c_device,
index_t M,
index_t N,
index_t K,
index_t stride_a,
index_t stride_b,
index_t stride_c)
{
ADataType* d_A;
BDataType* d_B;
CDataType* d_C;
hipError_t errA = hipMalloc(&d_A, M * K * sizeof(ADataType));
hipError_t errB = hipMalloc(&d_B, N * K * sizeof(BDataType));
hipError_t errC = hipMalloc(&d_C, M * N * sizeof(CDataType));
if(errA != hipSuccess)
{
std::cerr << "Error allocating device memory for A: " << hipGetErrorString(errA)
<< std::endl;
return; // Early exit on error
}
if(errB != hipSuccess)
{
std::cerr << "Error allocating device memory for B: " << hipGetErrorString(errB)
<< std::endl;
return; // Early exit on error
}
if(errC != hipSuccess)
{
std::cerr << "Error allocating device memory for C: " << hipGetErrorString(errC)
<< std::endl;
return; // Early exit on error
}
errA = hipMemcpy(
d_A, a_device.GetDeviceBuffer(), M * K * sizeof(ADataType), hipMemcpyHostToDevice);
if(errA != hipSuccess)
{
std::cerr << "Error copying A to device: " << hipGetErrorString(errA) << std::endl;
}
errB = hipMemcpy(
d_B, b_device.GetDeviceBuffer(), N * K * sizeof(BDataType), hipMemcpyHostToDevice);
if(errB != hipSuccess)
{
std::cerr << "Error copying B to device: " << hipGetErrorString(errB) << std::endl;
}
int totalElements = M * N;
int numThreadsPerBlock = 256; // Common choice for threads per block
int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType>
<<<numBlocks, numThreadsPerBlock>>>(d_A, d_B, d_C, M, N, K, stride_a, stride_b, stride_c);
errC = hipMemcpy(
c_device.GetDeviceBuffer(), d_C, M * N * sizeof(CDataType), hipMemcpyDeviceToHost);
if(errC != hipSuccess)
{
std::cerr << "Error copying C to device: " << hipGetErrorString(errC) << std::endl;
}
errA = hipFree(d_A);
if(errA != hipSuccess)
{
std::cerr << "Error free the A memory: " << hipGetErrorString(errA) << std::endl;
}
errB = hipFree(d_B);
if(errB != hipSuccess)
{
std::cerr << "Error free the B memory: " << hipGetErrorString(errB) << std::endl;
}
errC = hipFree(d_C);
if(errC != hipSuccess)
{
std::cerr << "Error free the C memory: " << hipGetErrorString(errC) << std::endl;
}
return;
}
} // namespace ck_tile } // namespace ck_tile
...@@ -76,8 +76,7 @@ struct GemmKernel ...@@ -76,8 +76,7 @@ struct GemmKernel
CK_TILE_DEVICE void operator()(GemmCommonKargs kargs) const CK_TILE_DEVICE void operator()(GemmCommonKargs kargs) const
{ {
const index_t i_m = TilePartitioner::iM; const auto [i_m, i_n] = TilePartitioner{}();
const index_t i_n = TilePartitioner::iN;
// options // options
const ADataType* a_start = static_cast<const ADataType*>(kargs.a_ptr); const ADataType* a_start = static_cast<const ADataType*>(kargs.a_ptr);
const BDataType* b_start = static_cast<const BDataType*>(kargs.b_ptr); const BDataType* b_start = static_cast<const BDataType*>(kargs.b_ptr);
...@@ -104,7 +103,7 @@ struct GemmKernel ...@@ -104,7 +103,7 @@ struct GemmKernel
}(); }();
auto b_tensor_view = [&]() { auto b_tensor_view = [&]() {
if constexpr(std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>) if constexpr(std::is_same_v<LayoutB, tensor_layout::gemm::RowMajor>)
{ {
return make_naive_tensor_view<address_space_enum::global>( return make_naive_tensor_view<address_space_enum::global>(
b_start, b_start,
......
...@@ -15,9 +15,6 @@ struct GemmTilePartitioner ...@@ -15,9 +15,6 @@ struct GemmTilePartitioner
static constexpr ck_tile::index_t kN = BlockGemmShape::kN; static constexpr ck_tile::index_t kN = BlockGemmShape::kN;
static constexpr ck_tile::index_t kK = BlockGemmShape::kK; static constexpr ck_tile::index_t kK = BlockGemmShape::kK;
const index_t iM = __builtin_amdgcn_readfirstlane(i_tile_m * kM);
const index_t iN = __builtin_amdgcn_readfirstlane(i_tile_n * kN);
CK_TILE_HOST static constexpr auto CK_TILE_HOST static constexpr auto
GridSize(ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t batch_size) GridSize(ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t batch_size)
{ {
...@@ -29,10 +26,9 @@ struct GemmTilePartitioner ...@@ -29,10 +26,9 @@ struct GemmTilePartitioner
CK_TILE_DEVICE auto operator()() CK_TILE_DEVICE auto operator()()
{ {
const index_t i_GridDimX = blockIdx.x; const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kM);
const index_t i_GridDimY = blockIdx.y; const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kN);
const index_t i_GridDimZ = blockIdx.z; return ck_tile::make_tuple(iM, iN);
return ck_tile::make_tuple(i_GridDimX, i_GridDimY, i_GridDimZ);
} }
}; };
} // namespace ck_tile } // namespace ck_tile
...@@ -23,6 +23,10 @@ void add_device_maxpool_bwd_bf16_instances( ...@@ -23,6 +23,10 @@ void add_device_maxpool_bwd_bf16_instances(
void add_device_maxpool_bwd_f32_instances( void add_device_maxpool_bwd_f32_instances(
std::vector<std::unique_ptr<DeviceMaxPoolBwd<F32, I32, F32>>>&); std::vector<std::unique_ptr<DeviceMaxPoolBwd<F32, I32, F32>>>&);
#endif #endif
#ifdef CK_ENABLE_FP8
void add_device_maxpool_bwd_f8_instances(
std::vector<std::unique_ptr<DeviceMaxPoolBwd<F8, I32, F8>>>&);
#endif
#ifdef CK_ENABLE_INT8 #ifdef CK_ENABLE_INT8
void add_device_maxpool_bwd_int8_instances( void add_device_maxpool_bwd_int8_instances(
std::vector<std::unique_ptr<DeviceMaxPoolBwd<I8, I32, I8>>>&); std::vector<std::unique_ptr<DeviceMaxPoolBwd<I8, I32, I8>>>&);
...@@ -53,6 +57,11 @@ struct DeviceOperationInstanceFactory< ...@@ -53,6 +57,11 @@ struct DeviceOperationInstanceFactory<
is_same_v<IndexDataType, I32>) is_same_v<IndexDataType, I32>)
add_device_maxpool_bwd_f32_instances(op_ptrs); add_device_maxpool_bwd_f32_instances(op_ptrs);
#endif #endif
#ifdef CK_ENABLE_FP8
else if constexpr(is_same_v<DOutDataType, F8> && is_same_v<DInDataType, F8> &&
is_same_v<IndexDataType, I32>)
add_device_maxpool_bwd_f8_instances(op_ptrs);
#endif
#ifdef CK_ENABLE_INT8 #ifdef CK_ENABLE_INT8
else if constexpr(is_same_v<DOutDataType, I8> && is_same_v<DInDataType, I8> && else if constexpr(is_same_v<DOutDataType, I8> && is_same_v<DInDataType, I8> &&
is_same_v<IndexDataType, I32>) is_same_v<IndexDataType, I32>)
......
...@@ -67,6 +67,36 @@ void add_device_pool2d_fwd_nhwc_index_f32_instances( ...@@ -67,6 +67,36 @@ void add_device_pool2d_fwd_nhwc_index_f32_instances(
std::vector<std::unique_ptr< std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, NHWC, NHWC, MaxOp, true>>>&); DevicePoolFwd<InOutRank, WindowRank, F32, F32, I32, NHWC, NHWC, MaxOp, true>>>&);
#endif #endif
#ifdef CK_ENABLE_INT8
// I8
void add_device_pool2d_fwd_nhwc_i8_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NHWC, NHWC, MaxOp, false>>>&);
void add_device_pool2d_fwd_nhwc_i8_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NHWC, NHWC, AvgOp, false>>>&);
// I8 - return index
void add_device_pool2d_fwd_nhwc_index_i8_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, I8, I8, I32, NHWC, NHWC, MaxOp, true>>>&);
#endif
#ifdef CK_ENABLE_FP8
// F8
void add_device_pool2d_fwd_nhwc_f8_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NHWC, NHWC, MaxOp, false>>>&);
void add_device_pool2d_fwd_nhwc_f8_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NHWC, NHWC, AvgOp, false>>>&);
// F8 - return index
void add_device_pool2d_fwd_nhwc_index_f8_instances(
std::vector<std::unique_ptr<
DevicePoolFwd<InOutRank, WindowRank, F8, F8, I32, NHWC, NHWC, MaxOp, true>>>&);
#endif
template <typename InDataType, template <typename InDataType,
typename OutDataType, typename OutDataType,
typename IndexDataType, typename IndexDataType,
...@@ -140,6 +170,34 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw ...@@ -140,6 +170,34 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
add_device_pool2d_fwd_nhwc_f32_instances(op_ptrs); add_device_pool2d_fwd_nhwc_f32_instances(op_ptrs);
} }
} }
#endif
#ifdef CK_ENABLE_INT8
else if constexpr(is_same_v<InDataType, I8> && is_same_v<OutDataType, I8> &&
is_same_v<IndexDataType, I32>)
{
if constexpr(OutputIndex && ReduceOpId == MaxOp)
{
add_device_pool2d_fwd_nhwc_index_i8_instances(op_ptrs);
}
else
{
add_device_pool2d_fwd_nhwc_i8_instances(op_ptrs);
}
}
#endif
#ifdef CK_ENABLE_FP8
else if constexpr(is_same_v<InDataType, F8> && is_same_v<OutDataType, F8> &&
is_same_v<IndexDataType, I32>)
{
if constexpr(OutputIndex && ReduceOpId == MaxOp)
{
add_device_pool2d_fwd_nhwc_index_f8_instances(op_ptrs);
}
else
{
add_device_pool2d_fwd_nhwc_f8_instances(op_ptrs);
}
}
#endif #endif
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment