Unverified Commit 32371ea5 authored by Illia Silin's avatar Illia Silin Committed by GitHub
Browse files

Merge branch 'develop' into navi3_rel

parents e42f9ecf 0e28de97
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <atomic>
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <functional>
#include <iostream>
#include <sstream>
#include <type_traits>
#include <unordered_map>
#include <vector>
#ifdef __linux__
#include <unistd.h>
#endif
#ifndef MIGRAPHX_GUARD_TEST_TEST_HPP
#define MIGRAPHX_GUARD_TEST_TEST_HPP
namespace test {
// X-macro table of the binary operators a test expression can capture.
// Each entry expands m(operator-token, identifier) so the same list can
// generate both the operator-object structs below and the member operator
// overloads inside expression/lhs_expression.
// clang-format off
// NOLINTNEXTLINE
#define TEST_FOREACH_BINARY_OPERATORS(m) \
m(==, equal) \
m(!=, not_equal) \
m(<=, less_than_equal) \
m(>=, greater_than_equal) \
m(<, less_than) \
m(>, greater_than) \
m(and, and_op) \
m(or, or_op)
// clang-format on
// X-macro table of the supported unary operators.
// clang-format off
// NOLINTNEXTLINE
#define TEST_FOREACH_UNARY_OPERATORS(m) \
m(not, not_op)
// clang-format on
// Generates one struct per binary operator with a printable spelling
// (as_string) and an evaluator (call) applying the operator.
// NOLINTNEXTLINE
#define TEST_EACH_BINARY_OPERATOR_OBJECT(op, name) \
struct name \
{ \
static std::string as_string() { return #op; } \
template <class T, class U> \
static decltype(auto) call(T&& x, U&& y) \
{ \
return x op y; \
} \
};
// Same as above for unary operators.
// NOLINTNEXTLINE
#define TEST_EACH_UNARY_OPERATOR_OBJECT(op, name) \
struct name \
{ \
static std::string as_string() { return #op; } \
template <class T> \
static decltype(auto) call(T&& x) \
{ \
return op x; \
} \
};
TEST_FOREACH_BINARY_OPERATORS(TEST_EACH_BINARY_OPERATOR_OBJECT)
TEST_FOREACH_UNARY_OPERATORS(TEST_EACH_UNARY_OPERATOR_OBJECT)
// Identity operator tag: renders as nothing and passes its argument
// through unchanged (returned by value because of the deduced return type).
struct nop
{
    static std::string as_string() { return ""; }
    template <class T>
    static auto call(T&& x)
    {
        return std::forward<T>(x);
    }
};
// Operator tag that evaluates a nullary callable; it has no textual
// representation of its own.
struct function
{
    static std::string as_string() { return {}; }
    template <class T>
    static decltype(auto) call(T&& x)
    {
        return x();
    }
};
// Forward declaration so the range operator<< below can defer to it.
template <class Stream, class Iterator>
Stream& stream_range(Stream& s, Iterator start, Iterator last);
// Print null pointers as the token "nullptr" instead of an address.
// (Two statements: the stream's own operator<< may return a base-class
// reference, so we must return s itself to keep the Stream& type.)
template <class Stream>
inline Stream& operator<<(Stream& s, std::nullptr_t)
{
    s << "nullptr";
    return s;
}
// Print any iterable range as "{ a, b, c }". SFINAE-disabled for types
// convertible to std::string so string-likes keep their normal printing;
// the trailing decltype also requires .begin()/.end() to exist.
template <class Stream,
          class Range,
          class = typename std::enable_if<not std::is_convertible<Range, std::string>{}>::type>
inline auto operator<<(Stream& s, const Range& v) -> decltype(stream_range(s, v.begin(), v.end()))
{
    s << "{ ";
    stream_range(s, v.begin(), v.end());
    s << "}";
    return s;
}
// Stream a comma-separated list: the first element is printed bare, every
// later element is prefixed with ", ". An empty range prints nothing.
template <class Stream, class Iterator>
inline Stream& stream_range(Stream& s, Iterator start, Iterator last)
{
    for(Iterator it = start; it != last; ++it)
    {
        if(it != start)
            s << ", ";
        s << *it;
    }
    return s;
}
// Base case for expression evaluation: a plain value is its own value.
// Expression types provide friend overloads that evaluate recursively.
template <class T>
const T& get_value(const T& x)
{
    return x;
}
template <class T, class Operator = nop>
struct lhs_expression;
// Wrap a value as the left-hand side of a captured expression.
template <class T>
lhs_expression<T> make_lhs_expression(T&& lhs);
template <class T, class Operator>
lhs_expression<T, Operator> make_lhs_expression(T&& lhs, Operator);
// Member operator generator: combines this node with a right-hand operand
// into a new expression node tagged with the operator object.
// NOLINTNEXTLINE
#define TEST_EXPR_BINARY_OPERATOR(op, name) \
template <class V> \
auto operator op(const V& rhs2) const \
{ \
return make_expression(*this, rhs2, name{}); /* NOLINT */ \
}
// Member operator generator for unary operators (wraps this->lhs).
// NOLINTNEXTLINE
#define TEST_EXPR_UNARY_OPERATOR(op, name) \
auto operator op() const { return make_lhs_expression(lhs, name{}); /* NOLINT */ }
// A captured binary expression: both operands plus an Operator tag, so a
// failing check can be both evaluated and pretty-printed with the original
// operator spelling.
template <class T, class U, class Operator>
struct expression
{
    T lhs;
    U rhs;
    friend std::ostream& operator<<(std::ostream& s, const expression& self)
    {
        s << self.lhs << " " << Operator::as_string() << " " << self.rhs;
        return s;
    }
    // Operands may themselves be expressions; get_value evaluates recursively.
    friend decltype(auto) get_value(const expression& e) { return e.value(); }
    decltype(auto) value() const { return Operator::call(get_value(lhs), get_value(rhs)); };
    TEST_FOREACH_UNARY_OPERATORS(TEST_EXPR_UNARY_OPERATOR)
    TEST_FOREACH_BINARY_OPERATORS(TEST_EXPR_BINARY_OPERATOR)
};
// TODO: Remove rvalue references
// Build a binary expression node from the two operands and an operator tag.
// Fix: the parameters were previously named backwards (the first parameter,
// which is the LEFT operand, was called `rhs` and vice versa); callers pass
// positionally, so renaming them is behavior-neutral.
template <class T, class U, class Operator>
expression<T, U, Operator> make_expression(T&& lhs, U&& rhs, Operator)
{
    return {std::forward<T>(lhs), std::forward<U>(rhs)};
}
// TODO: Remove rvalue reference
// Wrap a raw value in an lhs_expression with the identity (nop) operator.
template <class T>
lhs_expression<T> make_lhs_expression(T&& lhs)
{
    return lhs_expression<T>{std::forward<T>(lhs)};
}
// Wrap a value with an explicit (unary) operator tag.
template <class T, class Operator>
lhs_expression<T, Operator> make_lhs_expression(T&& lhs, Operator)
{
    return lhs_expression<T, Operator>{std::forward<T>(lhs)};
}
// A captured single operand, optionally under a unary operator. This is the
// seed of every expression tree built by TEST_CAPTURE; its operator
// overloads grow the tree one node at a time.
template <class T, class Operator>
struct lhs_expression
{
    T lhs;
    explicit lhs_expression(T e) : lhs(e) {}
    friend std::ostream& operator<<(std::ostream& s, const lhs_expression& self)
    {
        // Only print the operator spelling when there is one (nop prints "").
        std::string op = Operator::as_string();
        if(not op.empty())
            s << Operator::as_string() << " ";
        s << self.lhs;
        return s;
    }
    friend decltype(auto) get_value(const lhs_expression& e) { return e.value(); }
    decltype(auto) value() const { return Operator::call(get_value(lhs)); }
    TEST_FOREACH_BINARY_OPERATORS(TEST_EXPR_BINARY_OPERATOR)
    TEST_FOREACH_UNARY_OPERATORS(TEST_EXPR_UNARY_OPERATOR)
// Arithmetic/bitwise operators are applied eagerly to the captured value and
// rewrapped, so only the outermost comparison is recorded for printing.
// NOLINTNEXTLINE
#define TEST_LHS_REOPERATOR(op) \
template <class U> \
auto operator op(const U& rhs) const \
{ \
return make_lhs_expression(lhs op rhs); \
}
    TEST_LHS_REOPERATOR(+)
    TEST_LHS_REOPERATOR(-)
    TEST_LHS_REOPERATOR(*)
    TEST_LHS_REOPERATOR(/)
    TEST_LHS_REOPERATOR(%)
    TEST_LHS_REOPERATOR(&)
    TEST_LHS_REOPERATOR(|)
    TEST_LHS_REOPERATOR(^)
};
// A named nullary predicate: prints as `msg`, evaluates by invoking f.
// The implicit conversion operator also evaluates, so a predicate can be
// used directly where its result type is expected.
template <class F>
struct predicate
{
    std::string msg;
    F f;
    friend std::ostream& operator<<(std::ostream& s, const predicate& self)
    {
        s << self.msg;
        return s;
    }
    decltype(auto) operator()() const { return f(); }
    operator decltype(auto)() const { return f(); }
};
// Wrap a callable with a display message so failures print `msg` rather
// than an opaque value; the `function` tag makes evaluation call f.
template <class F>
auto make_predicate(const std::string& msg, F f)
{
    return make_lhs_expression(predicate<F>{msg, f}, function{});
}
// Spell booleans out as "true"/"false" rather than streaming 1/0.
inline std::string as_string(bool x)
{
    return x ? "true" : "false";
}
// Stringify any streamable value via its operator<<.
template <class T>
std::string as_string(const T& x)
{
    std::ostringstream oss;
    oss << x;
    return oss.str();
}
// Stringify an iterator range as a comma-separated list (no braces).
template <class Iterator>
std::string as_string(Iterator start, Iterator last)
{
    std::stringstream ss;
    stream_range(ss, start, last);
    return ss.str();
}
// Wrap callable f so invoking it produces a predicate whose failure message
// looks like a call, e.g. "near(1, 2, 0.001)". Arguments are stringified
// eagerly; evaluation of f itself is deferred until the predicate runs.
template <class F>
auto make_function(const std::string& name, F f)
{
    return [=](auto&&... xs) {
        std::vector<std::string> args = {as_string(xs)...};
        return make_predicate(name + "(" + as_string(args.begin(), args.end()) + ")",
                              [=] { return f(xs...); });
    };
}
// Entry point of expression capture: `capture{}->*x` wraps the first operand
// of an assertion so the operators defined above take over from there.
// ->* binds tighter than comparisons but looser than most arithmetic, which
// is what makes TEST_CAPTURE(a == b) capture both sides.
struct capture
{
    template <class T>
    auto operator->*(const T& x) const
    {
        return make_lhs_expression(x);
    }
    // Already-wrapped expressions pass through unchanged.
    template <class T, class Operator>
    auto operator->*(const lhs_expression<T, Operator>& x) const
    {
        return x;
    }
};
// ANSI SGR codes used for terminal coloring; the numeric values are the
// standard escape-sequence parameters (fg_* = 30s, bg_* = 40s).
enum class color
{
    reset = 0,
    bold = 1,
    underlined = 4,
    fg_red = 31,
    fg_green = 32,
    fg_yellow = 33,
    fg_blue = 34,
    fg_default = 39,
    bg_red = 41,
    bg_green = 42,
    bg_yellow = 43,
    bg_blue = 44,
    bg_default = 49
};
// Emit the ANSI escape sequence for a color, but only when stdout is a
// terminal (checked once via isatty) and not on Windows, where the escape
// codes are not emitted at all.
inline std::ostream& operator<<(std::ostream& os, const color& c)
{
#ifndef _WIN32
    static const bool use_color = isatty(STDOUT_FILENO) != 0;
    if(use_color)
        return os << "\033[" << static_cast<std::size_t>(c) << "m";
#else
    (void)c;
#endif
    return os;
}
// Process-wide failure counter shared by the assertion machinery; reset to
// zero at the start of each test case.
inline std::atomic<int>& failures()
{
    // NOLINTNEXTLINE
    static std::atomic<int> count{0};
    return count;
}
// Record and report a failed assertion. `x` is the captured expression,
// `msg` its source text; on failure the counter is bumped, diagnostics are
// printed, and the continuation f runs (a no-op for CHECK, test::fail for
// EXPECT).
template <class T, class F>
void failed(T x, const char* msg, const char* func, const char* file, int line, F f)
{
    if(not bool(x.value()))
    {
        failures()++;
        std::cout << func << std::endl;
        std::cout << file << ":" << line << ":" << std::endl;
        std::cout << color::bold << color::fg_red << "    FAILED: " << color::reset << msg << " "
                  << "[ " << x << " ]" << std::endl;
        f();
    }
}
// Returns true iff invoking f raises any exception.
template <class F>
bool throws(F f)
{
    try
    {
        f();
    }
    catch(...)
    {
        return true;
    }
    return false;
}
// Returns true iff f throws an Exception whose what() contains msg.
// Note: only Exception (or derived) is caught — any other exception type
// propagates out of this function rather than returning false.
template <class Exception, class F>
bool throws(F f, const std::string& msg = "")
{
    try
    {
        f();
        return false;
    }
    catch(const Exception& ex)
    {
        return std::string(ex.what()).find(msg) != std::string::npos;
    }
}
// Predicate: |px - py| < ptol (strictly less than). Failure messages print
// as near(px, py, ptol) via make_function.
template <class T, class U>
auto within_abs(T px, U py, double ptol = 1e-6f)
{
    return make_function("near", [](auto x, auto y, auto tol) { return std::abs(x - y) < tol; })(
        px, py, ptol);
}
// Glob matcher: '*' matches zero or more characters and '?' matches exactly
// one. Character classes are not supported.
//
// The algorithm first advances both ranges over literally-matching (or '?')
// positions. If the pattern is then exhausted, the match succeeds only when
// the text is too; otherwise the next pattern character must be a '*' (runs
// of consecutive stars collapse to one). A trailing star always matches.
// For an interior star we recursively try the remaining pattern at every
// remaining text position — the "star loop" — letting the star absorb zero
// or more characters.
//
// This simple recursive form is well suited to globbing test names, where
// patterns are short and contain few stars, and is still significantly
// faster than the equivalent std::regex.
template <class Iterator1, class Iterator2>
bool glob_match(Iterator1 start, Iterator1 last, Iterator2 pattern_start, Iterator2 pattern_last)
{
    // Advance past the longest prefix where text and pattern agree
    while(start != last and pattern_start != pattern_last)
    {
        const auto m = *pattern_start;
        if(m == '*')
            break;
        if(m != '?' and *start != m)
            break;
        ++start;
        ++pattern_start;
    }
    // No pattern left: match only if the text is also fully consumed
    if(pattern_start == pattern_last)
        return start == last;
    // A non-star pattern character here means a literal mismatch
    if(*pattern_start != '*')
        return false;
    // Collapse consecutive stars into a single star
    while(pattern_start != pattern_last and *pattern_start == '*')
        ++pattern_start;
    // A trailing star matches everything that remains
    if(pattern_start == pattern_last)
        return true;
    // Star loop: let the star consume zero or more characters
    for(; start != last; ++start)
    {
        if(glob_match(start, last, pattern_start, pattern_last))
            return true;
    }
    return false;
}
using string_map = std::unordered_map<std::string, std::vector<std::string>>;
// Group command-line tokens under flags. `keyword` maps a token to a
// (possibly empty) list of flag spellings: an empty list means the token is
// a value appended under the current flag; otherwise the first entry is
// recorded as a present flag (even with no values) and the last entry
// becomes the key that collects subsequent values. Tokens seen before any
// flag accumulate under the empty-string key.
template <class Keyword>
string_map generic_parse(std::vector<std::string> as, Keyword keyword)
{
    string_map parsed;
    std::string current;
    for(const auto& token : as)
    {
        auto names = keyword(token);
        if(names.empty())
        {
            parsed[current].push_back(token);
            continue;
        }
        parsed[names.front()]; // Ensure the flag exists even without values
        current = names.back();
    }
    return parsed;
}
using test_case = std::function<void()>;
// Global registry of (name, callable) pairs in registration order.
inline auto& get_test_cases()
{
    // NOLINTNEXTLINE
    static std::vector<std::pair<std::string, test_case>> registry;
    return registry;
}
// Append one named test case to the registry.
inline void add_test_case(std::string name, test_case f)
{
    get_test_cases().push_back({std::move(name), std::move(f)});
}
// Registers a test case at static-initialization time; used by the
// TEST_CASE_REGISTER macro via a file-static instance.
struct auto_register_test_case
{
    template <class F>
    auto_register_test_case(const char* name, F f) noexcept
    {
        add_test_case(name, f);
    }
};
// Sentinel exception used by EXPECT to abort the current test case; the
// driver catches it and continues with the next case.
struct failure_error
{
};
[[noreturn]] inline void fail() { throw failure_error{}; }
// Command-line test driver: registers the built-in flags, parses argv,
// selects test cases (exact name or glob), runs them, and reports results.
// With --continue each case is re-executed in a child process so a failure
// does not stop the run.
struct driver
{
    driver()
    {
        add_flag({"--help", "-h"}, "Show help");
        add_flag({"--list", "-l"}, "List all test cases");
        add_flag({"--continue", "-c"}, "Continue after failure");
        add_flag({"--quiet", "-q"}, "Don't print out extra output");
    }
    // One command-line option: its spellings, help text, and value count.
    struct argument
    {
        std::vector<std::string> flags = {};
        std::string help = "";
        int nargs = 1;
    };
    // Register an option that takes one value.
    void add_arg(const std::vector<std::string>& flags, const std::string& help = "")
    {
        arguments.push_back(argument{flags, help, 1});
    }
    // Register a boolean option that takes no value.
    void add_flag(const std::vector<std::string>& flags, const std::string& help = "")
    {
        arguments.push_back(argument{flags, help, 0});
    }
    // Word-wrap `text` to roughly line_length columns, prefixing every line.
    static void wrap(std::ostream& os,
                     const std::string& text,
                     const std::string& prefix = "",
                     unsigned int line_length = 80)
    {
        std::istringstream iss(text);
        std::string line = prefix;
        do
        {
            std::string word;
            iss >> word;
            if(line.length() + word.length() > line_length)
            {
                os << line << std::endl;
                line = prefix;
            }
            line += word + " ";
        } while(iss);
        if(not line.empty())
            os << line << std::endl;
    }
    // Print the usage/help screen to stdout.
    void show_help(const std::string& exe) const
    {
        const std::string prefix = "    ";
        std::cout << std::endl;
        std::cout << color::fg_yellow << "USAGE:" << color::reset << std::endl;
        std::cout << "    ";
        std::cout << exe << " <test-case>... <options>" << std::endl;
        std::cout << std::endl;
        std::cout << color::fg_yellow << "ARGS:" << color::reset << std::endl;
        std::cout << "    ";
        std::cout << color::fg_green << "<test-case>..." << color::reset;
        std::cout << std::endl;
        wrap(std::cout,
             "Test cases to run. A test case can be either the exact test case name or a glob. A "
             "glob expression uses a '*' to select zero or more characters or a '?' to select any "
             "single character.",
             prefix + prefix);
        std::cout << std::endl;
        std::cout << color::fg_yellow << "OPTIONS:" << color::reset << std::endl;
        for(auto&& arg : arguments)
        {
            std::cout << color::fg_green;
            std::string arg_prefix = prefix;
            for(const std::string& a : arg.flags)
            {
                std::cout << arg_prefix;
                std::cout << a;
                arg_prefix = ", ";
            }
            std::cout << color::reset << std::endl;
            wrap(std::cout, arg.help, prefix + prefix);
        }
    }
    // Output stream for progress messages; a swallow-everything stream when
    // --quiet is in effect.
    std::ostream& out() const
    {
        struct null_buffer : std::streambuf
        {
            virtual int overflow(int c) override { return c; }
        };
        static null_buffer buffer;
        static std::ostream null_stream(&buffer);
        if(quiet)
            return null_stream;
        return std::cout;
    }
    // Parse argv into a flag->values map; positional arguments land under ""
    // and the executable path under "__exe__".
    string_map parse(int argc, const char* argv[]) const
    {
        std::vector<std::string> args(argv + 1, argv + argc);
        string_map keys;
        for(auto&& arg : arguments)
        {
            for(auto&& flag : arg.flags)
            {
                // Map every spelling to the canonical (first) spelling; a
                // second entry marks zero-value flags for generic_parse.
                keys[flag] = {arg.flags.front()};
                if(arg.nargs == 0)
                    keys[flag].push_back("");
            }
        }
        auto result = generic_parse(args, [&](auto&& s) -> std::vector<std::string> {
            if(keys.count(s) > 0)
                return keys[s];
            else
                return {};
        });
        result["__exe__"].push_back(argv[0]);
        return result;
    }
    // Reassemble a parsed argument map into a shell command line.
    static std::string create_command(const string_map& args)
    {
        std::stringstream ss;
        ss << args.at("__exe__").front();
        if(args.count("") > 0)
        {
            for(auto&& arg : args.at(""))
                ss << " \"" << arg << "\"";
        }
        for(auto&& p : args)
        {
            if(p.first == "__exe__")
                continue;
            if(p.first.empty())
                continue;
            ss << " " << p.first;
            for(auto&& arg : p.second)
                ss << " \"" << arg << "\"";
        }
        return ss.str();
    }
    // Re-run a single test case in a child process (used for --continue);
    // returns an error message, or "" on success.
    static std::string fork(const std::string& name, string_map args)
    {
        std::string msg;
        args[""] = {name};
        args.erase("--continue"); // Avoid recursive forking in the child
        args["--quiet"];
        auto cmd = create_command(args);
        auto r = std::system(cmd.c_str()); // NOLINT
        if(r != 0)
            msg = "Exited with " + std::to_string(r);
        return msg;
    }
    // All registered test cases whose names match the glob pattern.
    static std::vector<std::pair<std::string, test_case>> glob_tests(const std::string& pattern)
    {
        std::vector<std::pair<std::string, test_case>> result;
        std::copy_if(get_test_cases().begin(),
                     get_test_cases().end(),
                     std::back_inserter(result),
                     [&](auto&& p) {
                         return glob_match(
                             p.first.begin(), p.first.end(), pattern.begin(), pattern.end());
                     });
        return result;
    }
    // Run one test case, time it, and print its RUN/COMPLETE/FAILED status.
    void run_test_case(const std::string& name, const test_case& f, const string_map& args)
    {
        ran++;
        out() << color::fg_green << "[ RUN ] " << color::reset << color::bold << name
              << color::reset << std::endl;
        std::string msg;
        auto start = std::chrono::steady_clock::now();
        if(args.count("--continue") > 0)
        {
            msg = fork(name, args);
        }
        else
        {
            try
            {
                failures() = 0;
                f();
            }
            // cppcheck-suppress migraphx-EmptyCatchStatement
            catch(const failure_error&)
            {
            }
        }
        auto finish = std::chrono::steady_clock::now();
        auto elapsed_ms =
            std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(finish - start)
                .count();
        if(msg.empty() and failures() != 0)
        {
            if(failures() == 1)
                msg = "Test failure";
            else
                msg = std::to_string(failures()) + " test failures";
        }
        if(msg.empty())
        {
            out() << color::fg_green << "[ COMPLETE ] " << color::reset;
        }
        else
        {
            failed.push_back(name);
            out() << color::fg_red << "[ FAILED ] " << color::reset;
        }
        out() << color::bold << name << color::reset;
        out() << color::fg_blue << " (" << elapsed_ms << "ms)" << color::reset;
        if(not msg.empty())
            out() << ": " << color::fg_yellow << msg << color::reset;
        out() << std::endl;
    }
    // Main entry: handle --help/--list/--quiet, select cases (all, exact
    // names, or globs), run them, summarize, and exit(1) on any failure.
    void run(int argc, const char* argv[])
    {
        auto args = parse(argc, argv);
        if(args.count("--help") > 0)
        {
            show_help(args.at("__exe__").front());
            return;
        }
        if(args.count("--list") > 0)
        {
            for(auto&& tc : get_test_cases())
                out() << tc.first << std::endl;
            return;
        }
        if(args.count("--quiet") > 0)
            quiet = true;
        auto cases = args[""];
        if(cases.empty())
        {
            for(auto&& tc : get_test_cases())
                run_test_case(tc.first, tc.second, args);
        }
        else
        {
            std::unordered_map<std::string, test_case> m(get_test_cases().begin(),
                                                         get_test_cases().end());
            for(auto&& iname : cases)
            {
                std::vector<std::pair<std::string, test_case>> found_cases;
                for(auto&& pattern : get_case_names(iname))
                {
                    // Exact name wins; otherwise treat the name as a glob
                    auto f = m.find(pattern);
                    if(f == m.end())
                    {
                        found_cases = glob_tests(pattern);
                    }
                    else
                    {
                        found_cases.push_back(*f);
                    }
                }
                if(found_cases.empty())
                {
                    out() << color::fg_red << "[ ERROR ] Test case '" << iname << "' not found."
                          << color::reset << std::endl;
                    failed.push_back(iname);
                }
                for(auto&& p : found_cases)
                    run_test_case(p.first, p.second, args);
            }
        }
        out() << color::fg_green << "[==========] " << color::fg_yellow << ran << " tests ran"
              << color::reset << std::endl;
        if(not failed.empty())
        {
            out() << color::fg_red << "[ FAILED ] " << color::fg_yellow << failed.size()
                  << " tests failed" << color::reset << std::endl;
            for(auto&& name : failed)
                out() << color::fg_red << "[ FAILED ] " << color::fg_yellow << name
                      << color::reset << std::endl;
            std::exit(1);
        }
    }
    // Hook for expanding a user-supplied name into case names/patterns.
    std::function<std::vector<std::string>(const std::string&)> get_case_names =
        [](const std::string& name) -> std::vector<std::string> { return {name}; };
    std::vector<argument> arguments = {};
    std::vector<std::string> failed = {};
    std::size_t ran = 0;
    bool quiet = false;
};
// Convenience entry point for a test binary's main(): build a driver with
// the default flags and run it on the given command line.
inline void run(int argc, const char* argv[])
{
    driver d{};
    d.run(argc, argv);
}
} // namespace test
// Capture an expression tree for diagnostics: capture{}->*expr wraps the
// first operand so subsequent operators build printable expression objects.
// NOLINTNEXTLINE
#define TEST_CAPTURE(...) test::capture{}->*__VA_ARGS__
// Assertion that records a failure but lets the test case continue.
// NOLINTNEXTLINE
#define CHECK(...) \
    test::failed( \
        TEST_CAPTURE(__VA_ARGS__), #__VA_ARGS__, __PRETTY_FUNCTION__, __FILE__, __LINE__, [] {})
// Assertion that aborts the current test case on failure (via test::fail).
// NOLINTNEXTLINE
#define EXPECT(...) \
    test::failed(TEST_CAPTURE(__VA_ARGS__), \
                 #__VA_ARGS__, \
                 __PRETTY_FUNCTION__, \
                 __FILE__, \
                 __LINE__, \
                 &test::fail)
// Expect an expression to evaluate to a zero (success) status.
// NOLINTNEXTLINE
#define STATUS(...) EXPECT((__VA_ARGS__) == 0)
// Token-pasting helpers; two levels so macro arguments expand first.
// NOLINTNEXTLINE
#define TEST_CAT(x, ...) TEST_PRIMITIVE_CAT(x, __VA_ARGS__)
// NOLINTNEXTLINE
#define TEST_PRIMITIVE_CAT(x, ...) x##__VA_ARGS__
// Register an existing function as a test case named after its spelling.
// NOLINTNEXTLINE
#define TEST_CASE_REGISTER(...) \
    static test::auto_register_test_case TEST_CAT(register_test_case_, __LINE__) = \
        test::auto_register_test_case(#__VA_ARGS__, &__VA_ARGS__);
// Declare, register, and begin the definition of a test case function.
// NOLINTNEXTLINE
#define TEST_CASE(...) \
    void __VA_ARGS__(); \
    TEST_CASE_REGISTER(__VA_ARGS__) \
    void __VA_ARGS__()
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wglobal-constructors"
#endif
#endif
# Locate the HIP toolchain; provides the hip::host target linked below.
find_package(hip)
# Collect all RTC sources; CONFIGURE_DEPENDS re-globs at configure/build time.
file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)
add_library(ck_rtc ${RTC_SOURCES})
target_include_directories(ck_rtc PUBLIC include)
target_link_libraries(ck_rtc PUBLIC hip::host)
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#include <rtc/kernel.hpp>
#include <filesystem>
#include <string>
namespace rtc {
// One source file handed to the RTC compiler: a relative path (recreated
// under a temporary directory) and a non-owning view of its contents.
// NOTE(review): content is a string_view — the backing storage must outlive
// the compile_kernel call.
struct src_file
{
    std::filesystem::path path;
    std::string_view content;
};
// Options for compile_kernel: extra compiler flags and the symbol name to
// load from the compiled module (defaults to "main").
struct compile_options
{
    std::string flags = "";
    std::string kernel_name = "main";
};
// Compile the given sources and return the loaded kernel.
kernel compile_kernel(const std::vector<src_file>& src,
                      compile_options options = compile_options{});
} // namespace rtc
#endif
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP
#include <hip/hip_runtime_api.h>
#include <memory>
#include <string>
namespace rtc {
// Shared-ownership typed array used to shuttle data to and from the GPU.
// size() is the element count and bytes() the total byte count; at() is the
// bounds-checked accessor (throws std::runtime_error when out of range).
template <class T>
struct buffer
{
    buffer() : ptr(), n(0) {}
    buffer(std::shared_ptr<T> p, std::size_t sz) : ptr(p), n(sz) {}
    // Adopt an untyped allocation (e.g. from allocate_gpu) holding sz
    // elements of T.
    buffer(std::shared_ptr<void> p, std::size_t sz)
        : ptr(std::reinterpret_pointer_cast<T>(p)), n(sz)
    {
    }
    // Owning host allocation of sz elements. Fix: an array deleter is
    // required here — shared_ptr<T>(new T[sz]) alone would run `delete`
    // instead of `delete[]` on destruction, which is undefined behavior.
    explicit buffer(std::size_t sz) : ptr(new T[sz], std::default_delete<T[]>()), n(sz) {}
    T* begin() { return data(); }
    T* end() { return data() + size(); }
    const T* begin() const { return data(); }
    const T* end() const { return data() + size(); }
    T& front() { return data()[0]; }
    T& back() { return data()[size() - 1]; }
    T& operator[](std::size_t i) { return data()[i]; }
    T& at(std::size_t i)
    {
        if(i >= size())
            throw std::runtime_error("Out of bounds");
        return data()[i];
    }
    const T& front() const { return data()[0]; }
    const T& back() const { return data()[size() - 1]; }
    const T& operator[](std::size_t i) const { return data()[i]; }
    const T& at(std::size_t i) const
    {
        if(i >= size())
            throw std::runtime_error("Out of bounds");
        return data()[i];
    }
    const T* data() const { return ptr.get(); }
    T* data() { return ptr.get(); }
    std::size_t size() const { return n; }
    std::size_t bytes() const { return size() * sizeof(T); }
    bool empty() const { return size() == 0; }

    private:
    std::shared_ptr<T> ptr;
    std::size_t n;
};
std::string get_device_name();
std::string hip_error(int error);
std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host = false);
std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host = false);
std::shared_ptr<void> read_from_gpu(const void* x, std::size_t sz);
// Copy a host buffer into a fresh device allocation of the same size.
template <class T>
buffer<T> to_gpu(const buffer<T>& input)
{
    return {write_to_gpu(input.data(), input.bytes()), input.size()};
}
// Copy a device buffer back into a fresh host allocation of the same size.
template <class T>
buffer<T> from_gpu(const buffer<T>& input)
{
    return {read_from_gpu(input.data(), input.bytes()), input.size()};
}
} // namespace rtc
#endif
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL
#include <hip/hip_runtime_api.h>
#include <memory>
#include <string>
#include <vector>
namespace rtc {
// Type-erased view of one kernel argument: size, alignment, and address of
// the referenced object. Non-owning — the argument must stay alive until
// the launch call consumes it.
struct kernel_argument
{
    // Constructible from any value that is not itself a kernel_argument;
    // records a pointer into the caller's object (hence the NOLINT on the
    // implicit reference capture).
    template <class T,
              class U = std::remove_reference_t<T>,
              class = std::enable_if_t<not std::is_base_of<kernel_argument, T>{}>>
    kernel_argument(T&& x) : size(sizeof(U)), align(alignof(U)), data(&x) // NOLINT
    {
    }
    std::size_t size;
    std::size_t align;
    void* data;
};
std::vector<char> pack_args(const std::vector<kernel_argument>& args);
struct kernel_impl;
// A loaded GPU kernel (HIP module + function), shared-ownership so copies
// of a kernel stay valid until the last one is destroyed.
struct kernel
{
    kernel() = default;
    // Load a code object image and resolve the named kernel function.
    kernel(const char* image, const std::string& name);
    // Convenience overload for byte containers (vector<char> etc.).
    template <class T>
    kernel(const std::vector<T>& image, const std::string& name)
        : kernel(reinterpret_cast<const char*>(image.data()), name)
    {
        static_assert(sizeof(T) == 1, "Only byte types");
    }
    // Launch with packed, type-erased arguments.
    void launch(hipStream_t stream,
                std::size_t global,
                std::size_t local,
                const std::vector<kernel_argument>& args) const;
    // Launch with a raw pointer-per-argument array.
    void launch(hipStream_t stream,
                std::size_t global,
                std::size_t local,
                std::vector<void*> args) const;
    // Curried form: returns a callable that collects the kernel arguments.
    template <class... Ts>
    auto launch(hipStream_t stream, std::size_t global, std::size_t local, Ts... zs) const
    {
        return [=](auto&&... xs) {
            launch(stream, global, local, std::vector<kernel_argument>{xs...}, zs...);
        };
    }

    private:
    std::shared_ptr<kernel_impl> impl;
};
} // namespace rtc
#endif
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER
#include <type_traits>
#include <memory>
namespace rtc {
// Deleter that calls a fixed C-style release function f on non-null
// pointers; the return value of f is deliberately discarded.
template <class F, F f>
struct manage_deleter
{
    template <class T>
    void operator()(T* x) const
    {
        if(x != nullptr)
        {
            (void)f(x);
        }
    }
};
// Deleter that does nothing — for wrapping non-owned pointers.
struct null_deleter
{
    template <class T>
    void operator()(T*) const
    {
    }
};
// unique_ptr bound to a specific release function at compile time.
template <class T, class F, F f>
using manage_ptr = std::unique_ptr<T, manage_deleter<F, f>>;
template <class T>
struct element_type
{
    using type = typename T::element_type;
};
// The pointee type of either a raw pointer or a smart pointer.
template <class T>
using remove_ptr = typename std::
    conditional_t<std::is_pointer<T>{}, std::remove_pointer<T>, element_type<T>>::type;
template <class T>
using shared = std::shared_ptr<remove_ptr<T>>;
// Convert any pointer-like owner into a shared_ptr of the pointee.
template <class T>
shared<T> share(T p)
{
    return shared<T>{std::move(p)};
}
// Shorthand: manage_ptr for a raw handle type T released by function F.
#define RTC_MANAGE_PTR(T, F) rtc::manage_ptr<std::remove_pointer_t<T>, decltype(&F), &F>
} // namespace rtc
#endif
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#include <string>
#include <filesystem>
namespace rtc {
// RAII temporary directory: created on construction (optionally with a
// name prefix), removed in the destructor. Non-copyable since it owns the
// directory on disk.
struct tmp_dir
{
    std::filesystem::path path;
    tmp_dir(const std::string& prefix = "");
    // Run a shell command with this directory as context.
    void execute(const std::string& cmd) const;
    tmp_dir(tmp_dir const&) = delete;
    tmp_dir& operator=(tmp_dir const&) = delete;
    ~tmp_dir();
};
} // namespace rtc
#endif
#include "rtc/hip.hpp"
#include <rtc/compile_kernel.hpp>
#include <rtc/tmp_dir.hpp>
#include <stdexcept>
#include <iostream>
#include <fstream>
#include <cassert>
namespace rtc {
// Read a file (or a byte range of it) into container type T, e.g.
// std::string or std::vector<char>.
//
// @param filename  path of the file to read
// @param offset    byte offset at which reading starts
// @param nbytes    number of bytes to read; 0 means "from offset to EOF"
// @throws std::runtime_error if the file cannot be opened (previously a
//         failed open made tellg() return -1, turning nbytes into a huge
//         size_t), if offset is past the end, on an empty read, or on a
//         short read.
template <class T>
T generic_read_file(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
{
    std::ifstream is(filename, std::ios::binary | std::ios::ate);
    if(not is)
        throw std::runtime_error("Failed to open file: " + filename);
    if(nbytes == 0)
    {
        // if there is a non-zero offset and nbytes is not set,
        // calculate size of remaining bytes to read
        nbytes = is.tellg();
        if(offset > nbytes)
            throw std::runtime_error("offset is larger than file size");
        nbytes -= offset;
    }
    if(nbytes < 1)
        throw std::runtime_error("Invalid size for: " + filename);
    is.seekg(offset, std::ios::beg);
    T buffer(nbytes, 0);
    if(not is.read(&buffer[0], nbytes))
        throw std::runtime_error("Error reading file: " + filename);
    return buffer;
}
// Read a file (or a byte range of it) as raw bytes.
std::vector<char> read_buffer(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
{
    return generic_read_file<std::vector<char>>(filename, offset, nbytes);
}
// Read a whole file as a string.
std::string read_string(const std::string& filename)
{
    return generic_read_file<std::string>(filename);
}
// Write `size` raw bytes to `filename`.
// Fix: open in binary mode — the readers in this file use std::ios::binary,
// and a text-mode stream would translate newlines on platforms that do so.
// Also fail loudly (matching the error style of the read path) instead of
// silently dropping the data when the file cannot be opened.
void write_buffer(const std::string& filename, const char* buffer, std::size_t size)
{
    std::ofstream os(filename, std::ios::binary);
    if(not os)
        throw std::runtime_error("Failed to open file for writing: " + filename);
    os.write(buffer, size);
}
// Write a byte vector to a file.
void write_buffer(const std::string& filename, const std::vector<char>& buffer)
{
    write_buffer(filename, buffer.data(), buffer.size());
}
// Write a string view's bytes to a file.
void write_string(const std::string& filename, const std::string_view& buffer)
{
    write_buffer(filename, buffer.data(), buffer.size());
}
// Base compiler invocation for device-only HIP compilation.
// NOTE(review): the ROCm install path is hard-coded — consider honoring an
// environment override (e.g. ROCM_PATH).
std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip --cuda-device-only"; }
// Compile the given sources into a HIP code object in a temporary
// directory and load the requested kernel from it.
// Headers in `srcs` are written alongside the .cpp files (-I. makes them
// visible); only .cpp files are added to the compile command, and the first
// one's stem names the output object file.
kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options)
{
    assert(not srcs.empty());
    tmp_dir td{"compile"};
    options.flags += " -I. -O3";
    options.flags += " -std=c++17";
    options.flags += " --offload-arch=" + get_device_name();
    std::string out;
    for(const auto& src : srcs)
    {
        // Recreate each source file's relative path under the temp dir
        std::filesystem::path full_path = td.path / src.path;
        std::filesystem::path parent_path = full_path.parent_path();
        std::filesystem::create_directories(parent_path);
        write_string(full_path.string(), src.content);
        if(src.path.extension().string() == ".cpp")
        {
            options.flags += " -c " + src.path.filename().string();
            if(out.empty())
                out = src.path.stem().string() + ".o";
        }
    }
    options.flags += " -o " + out;
    td.execute(compiler() + options.flags);
    auto out_path = td.path / out;
    if(not std::filesystem::exists(out_path))
        throw std::runtime_error("Output file missing: " + out);
    auto obj = read_buffer(out_path.string());
    return kernel{obj.data(), options.kernel_name};
}
} // namespace rtc
#include <rtc/hip.hpp>
#include <rtc/manage_ptr.hpp>
#include <stdexcept>
#include <cassert>
namespace rtc {
// Owning pointer for device memory, released with hipFree.
using hip_ptr = RTC_MANAGE_PTR(void, hipFree);
// Human-readable description of a HIP status code.
std::string hip_error(int error) { return hipGetErrorString(static_cast<hipError_t>(error)); }
// Index of the currently active HIP device; throws when none is available.
int get_device_id()
{
    int device;
    auto status = hipGetDevice(&device);
    if(status != hipSuccess)
        throw std::runtime_error("No device");
    return device;
}
// Architecture name (gcnArchName) of the current device, used as the
// --offload-arch value when compiling kernels.
std::string get_device_name()
{
    hipDeviceProp_t props{};
    auto status = hipGetDeviceProperties(&props, get_device_id());
    if(status != hipSuccess)
        throw std::runtime_error("Failed to get device properties");
    return props.gcnArchName;
}
// True when ptr refers to device memory; any attribute-query failure is
// treated as "not a device pointer" rather than an error.
bool is_device_ptr(const void* ptr)
{
    hipPointerAttribute_t attr;
    auto status = hipPointerGetAttributes(&attr, ptr);
    if(status != hipSuccess)
        return false;
    return attr.type == hipMemoryTypeDevice;
}
// Block until all outstanding work on the device has finished.
void gpu_sync()
{
    auto status = hipDeviceSynchronize();
    if(status != hipSuccess)
        throw std::runtime_error("hip device synchronization failed: " + hip_error(status));
}
// Bytes of device memory currently free (per hipMemGetInfo).
std::size_t get_available_gpu_memory()
{
    size_t free;
    size_t total;
    auto status = hipMemGetInfo(&free, &total);
    if(status != hipSuccess)
        throw std::runtime_error("Failed getting available memory: " + hip_error(status));
    return free;
}
// Allocate sz bytes of device memory (or pinned host memory when host is
// true). If a device allocation fails, retries once as a host allocation;
// a failed host allocation is terminal and throws.
std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host)
{
    if(sz > get_available_gpu_memory())
        throw std::runtime_error("Memory not available to allocate buffer: " + std::to_string(sz));
    void* alloc_ptr = nullptr;
    auto status = host ? hipHostMalloc(&alloc_ptr, sz) : hipMalloc(&alloc_ptr, sz);
    if(status != hipSuccess)
    {
        if(host)
            throw std::runtime_error("Gpu allocation failed: " + hip_error(status));
        else
            return allocate_gpu(sz, true);
    }
    assert(alloc_ptr != nullptr);
    // NOTE(review): the pointer is always released with hipFree via hip_ptr,
    // including the hipHostMalloc path — confirm this matches HIP's rules.
    std::shared_ptr<void> result = share(hip_ptr{alloc_ptr});
    return result;
}
// Copy sz bytes from host pointer x into a freshly allocated device buffer;
// synchronizes the device first so the copy does not race prior work.
std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host)
{
    gpu_sync();
    auto result = allocate_gpu(sz, host);
    assert(is_device_ptr(result.get()));
    assert(not is_device_ptr(x));
    auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
    if(status != hipSuccess)
        throw std::runtime_error("Copy to gpu failed: " + hip_error(status));
    return result;
}
// Copy sz bytes from device pointer x into a freshly allocated host buffer;
// synchronizes the device first and requires x to actually be device memory.
std::shared_ptr<void> read_from_gpu(const void* x, std::size_t sz)
{
    gpu_sync();
    // Fix: the host buffer is an array allocation, so an array deleter is
    // required — shared_ptr<char>(new char[sz]) alone would run `delete`
    // instead of `delete[]` on destruction, which is undefined behavior.
    std::shared_ptr<char> result(new char[sz], std::default_delete<char[]>());
    assert(not is_device_ptr(result.get()));
    if(not is_device_ptr(x))
    {
        throw std::runtime_error(
            "read_from_gpu() requires Src buffer to be on the GPU, Copy from gpu failed\n");
    }
    auto status = hipMemcpy(result.get(), x, sz, hipMemcpyDeviceToHost);
    if(status != hipSuccess)
        throw std::runtime_error("Copy from gpu failed: " + hip_error(status)); // NOLINT
    return std::static_pointer_cast<void>(result);
}
} // namespace rtc
#include <rtc/kernel.hpp>
#include <rtc/manage_ptr.hpp>
#include <rtc/hip.hpp>
#include <cassert>
// extern declare the function since hip/hip_ext.h header is broken
extern hipError_t hipExtModuleLaunchKernel(hipFunction_t, // NOLINT
uint32_t,
uint32_t,
uint32_t,
uint32_t,
uint32_t,
uint32_t,
size_t,
hipStream_t,
void**,
void**,
hipEvent_t = nullptr,
hipEvent_t = nullptr,
uint32_t = 0);
namespace rtc {
// Serialize kernel arguments into a single contiguous kernarg buffer,
// inserting zero padding before each argument so it lands at its natural
// alignment.
std::vector<char> pack_args(const std::vector<kernel_argument>& args)
{
    std::vector<char> packed;
    for(const auto& arg : args)
    {
        // Pad up to the argument's alignment boundary
        const std::size_t misalign = packed.size() % arg.align;
        if(misalign != 0)
            packed.insert(packed.end(), arg.align - misalign, 0);
        const auto* bytes = static_cast<const char*>(arg.data);
        packed.insert(packed.end(), bytes, bytes + arg.size);
    }
    return packed;
}
// Owning pointer for a loaded HIP module, released with hipModuleUnload.
using hip_module_ptr = RTC_MANAGE_PTR(hipModule_t, hipModuleUnload);
// Shared state behind rtc::kernel: the module and its resolved function.
struct kernel_impl
{
    hip_module_ptr module = nullptr;
    hipFunction_t fun = nullptr;
};
// Load a code-object image into a HIP module.
// Fix: initialize the raw handle and check the status BEFORE handing the
// handle to the owning pointer — previously a failed hipModuleLoadData left
// raw_m uninitialized, and the owning wrapper would call hipModuleUnload on
// an indeterminate handle during unwinding.
hip_module_ptr load_module(const char* image)
{
    hipModule_t raw_m = nullptr;
    auto status = hipModuleLoadData(&raw_m, image);
    if(status != hipSuccess)
        throw std::runtime_error("Failed to load module: " + hip_error(status));
    return hip_module_ptr{raw_m};
}
// Load the code-object image and resolve the named kernel function from it.
kernel::kernel(const char* image, const std::string& name) : impl(std::make_shared<kernel_impl>())
{
    impl->module = load_module(image);
    auto status = hipModuleGetFunction(&impl->fun, impl->module.get(), name.c_str());
    if(hipSuccess != status)
        throw std::runtime_error("Failed to get function: " + name + ": " + hip_error(status));
}
// Launch `fun` on `stream` with a packed kernel-argument buffer.
// @param global    global work size in x (y and z are fixed at 1)
// @param local     workgroup size in x
// @param kernargs  pointer to the packed argument buffer (see pack_args)
// @param size      byte size of that buffer
// The buffer is passed through HIP's extra-launch-params mechanism
// (HIP_LAUNCH_PARAM_BUFFER_POINTER / _SIZE / _END) instead of individual
// kernel parameters.
// @throws std::runtime_error if the launch fails
void launch_kernel(hipFunction_t fun,
                   hipStream_t stream,
                   std::size_t global,
                   std::size_t local,
                   void* kernargs,
                   std::size_t size)
{
    assert(global > 0);
    assert(local > 0);
    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER,
                      kernargs,
                      HIP_LAUNCH_PARAM_BUFFER_SIZE,
                      &size,
                      HIP_LAUNCH_PARAM_END};
    // `config` decays to void** directly; the previous
    // reinterpret_cast<void**>(&config) produced the same pointer value and
    // is unnecessary.
    auto status = hipExtModuleLaunchKernel(fun,
                                           global,
                                           1,
                                           1,
                                           local,
                                           1,
                                           1,
                                           0,
                                           stream,
                                           nullptr,
                                           config,
                                           nullptr,
                                           nullptr);
    if(status != hipSuccess)
        throw std::runtime_error("Failed to launch kernel: " + hip_error(status));
}
// Launch with raw pointer arguments: the kernarg buffer is the array of
// pointers itself, i.e. every kernel parameter is assumed to be
// pointer-sized and pointer-aligned.
void kernel::launch(hipStream_t stream,
                    std::size_t global,
                    std::size_t local,
                    std::vector<void*> args) const
{
    assert(impl != nullptr);
    void* kernargs = args.data();
    std::size_t size = args.size() * sizeof(void*);
    launch_kernel(impl->fun, stream, global, local, kernargs, size);
}
// Launch with typed arguments of arbitrary size/alignment: the arguments
// are first serialized into a single aligned buffer by pack_args and then
// passed through the extra-launch-params path.
void kernel::launch(hipStream_t stream,
                    std::size_t global,
                    std::size_t local,
                    const std::vector<kernel_argument>& args) const
{
    assert(impl != nullptr);
    std::vector<char> kernargs = pack_args(args);
    std::size_t size = kernargs.size();
    launch_kernel(impl->fun, stream, global, local, kernargs.data(), size);
}
} // namespace rtc
\ No newline at end of file
#include <rtc/tmp_dir.hpp>

#include <algorithm>
#include <random>
#include <system_error>
#include <thread>

#include <unistd.h>
namespace rtc {
// Generate a random string of the given length, drawn uniformly from the
// alphanumeric characters [0-9a-zA-Z]. Seeded from std::random_device on
// every call, so successive calls are independent.
std::string random_string(std::string::size_type length)
{
    // Hold the character set by value: the original bound a static
    // reference to a temporary std::string, which relies on lifetime
    // extension and is a well-known anti-pattern.
    static const std::string chars = "0123456789"
                                     "abcdefghijklmnopqrstuvwxyz"
                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    std::mt19937 rg{std::random_device{}()};
    std::uniform_int_distribution<std::string::size_type> pick(0, chars.length() - 1);
    std::string str(length, 0);
    std::generate(str.begin(), str.end(), [&] { return chars[pick(rg)]; });
    return str;
}
// Build a name that is unique across processes, threads, and time:
// "<prefix>-<pid>-<tid>-<steady_clock ticks>-<16 random chars>", with the
// numeric fields rendered in hexadecimal.
std::string unique_string(const std::string& prefix)
{
    std::stringstream out;
    out << std::hex;
    out << prefix;
    out << "-" << getpid();
    out << "-" << std::this_thread::get_id();
    out << "-" << std::chrono::steady_clock::now().time_since_epoch().count();
    out << "-" << random_string(16);
    return out.str();
}
// Create a uniquely named directory under the system temp directory.
// The directory name is "ck-rtc-<unique suffix>", or
// "ck-rtc-<prefix>-<unique suffix>" when a prefix is given.
tmp_dir::tmp_dir(const std::string& prefix)
    : path(std::filesystem::temp_directory_path() /
           unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix))
{
    std::filesystem::create_directories(this->path);
}
// Run a shell command with this temporary directory as its working
// directory, by prefixing "cd <path>; " and passing the result to
// std::system.
// NOTE(review): the std::system return value is ignored, so command
// failures are silent — confirm callers check results another way.
// NOTE(review): the path is spliced in unquoted; assumes temp paths never
// contain spaces or shell metacharacters — verify on all target platforms.
void tmp_dir::execute(const std::string& cmd) const
{
    std::string s = "cd " + path.string() + "; " + cmd;
    std::system(s.c_str());
}
tmp_dir::~tmp_dir() { std::filesystem::remove_all(this->path); }
} // namespace rtc
\ No newline at end of file
......@@ -36,7 +36,7 @@ What is inside the image?
The docker images have everything you need for running CK including:
* `ROCm <https://www.amd.com/en/graphics/servers-solutions-rocm>`_
* `ROCm <https://rocm.docs.amd.com/en/latest/index.html>`_
* `CMake <https://cmake.org/getting-started/>`_
* `Compiler <https://github.com/ROCm/llvm-project>`_
* `Composable Kernel library <https://github.com/ROCm/composable_kernel>`_
......
rocm-docs-core==0.35.0
rocm-docs-core==0.35.1
sphinxcontrib-bibtex==2.6.2
......@@ -113,7 +113,7 @@ requests==2.31.0
# via
# pygithub
# sphinx
rocm-docs-core==0.35.0
rocm-docs-core==0.35.1
# via -r requirements.in
six==1.16.0
# via
......
......@@ -20,14 +20,18 @@ using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto LoopSched = ck::make_default_loop_scheduler();
static constexpr auto PipelineVer = ck::PipelineVersion::v1;
using ComputeTypeA = ck::f8_t;
using ComputeTypeB = ck::f8_t;
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8>;
// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Loop| Pipeline| Compute| Compute|
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Scheduler| Version| TypeA| TypeB|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
......
......@@ -27,10 +27,10 @@ using ComputeTypeB = ck::bf8_t;
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Loop| Pipeline| Compute| Compute|
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Scheduler| Version| TypeA| TypeB|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
// clang-format on
......
......@@ -498,94 +498,95 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
}
};
static bool IsSupportedArgument(const Argument& arg)
static constexpr bool IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_)
{
if(!ck::is_xdl_supported())
{
return false;
}
// check vector load/store
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
// check vector load of A
if constexpr(is_same_v<ALayout, Row> && ABlockTransferSrcVectorDim == 2)
{
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
// check vector load of A
if constexpr(is_same_v<ALayout, Row> && ABlockTransferSrcVectorDim == 2)
{
if(arg.KRaw_ % ABlockTransferSrcScalarPerVector != 0)
{
return false;
}
}
else if constexpr(is_same_v<ALayout, Col> && ABlockTransferSrcVectorDim == 1)
{
// FIXME: not rigorous
if(arg.MRaw_ % ABlockTransferSrcScalarPerVector != 0)
{
return false;
}
}
else
if(KRaw_ % ABlockTransferSrcScalarPerVector != 0)
{
return false;
}
// check vector laod of B
if constexpr(is_same_v<BLayout, Col> && BBlockTransferSrcVectorDim == 2)
}
else if constexpr(is_same_v<ALayout, Col> && ABlockTransferSrcVectorDim == 1)
{
// FIXME: not rigorous
if(MRaw_ % ABlockTransferSrcScalarPerVector != 0)
{
if(arg.KRaw_ % BBlockTransferSrcScalarPerVector != 0)
{
return false;
}
return false;
}
else if constexpr(is_same_v<BLayout, Row> && BBlockTransferSrcVectorDim == 1)
}
else
{
return false;
}
// check vector laod of B
if constexpr(is_same_v<BLayout, Col> && BBlockTransferSrcVectorDim == 2)
{
if(KRaw_ % BBlockTransferSrcScalarPerVector != 0)
{
// FIXME: not rigorous
if(arg.NRaw_ % BBlockTransferSrcScalarPerVector != 0)
{
return false;
}
return false;
}
else
}
else if constexpr(is_same_v<BLayout, Row> && BBlockTransferSrcVectorDim == 1)
{
// FIXME: not rigorous
if(NRaw_ % BBlockTransferSrcScalarPerVector != 0)
{
return false;
}
}
else
{
return false;
}
// check vector load of Ds
// only support RowMajor for now
bool all_valid = true;
static_for<0, NumDTensor, 1>{}([&](auto i) {
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
// check vector load of Ds
// only support RowMajor for now
bool all_valid = true;
if constexpr(!is_same_v<DLayout, Row>)
{
all_valid = false;
}
});
static_for<0, NumDTensor, 1>{}([&](auto i) {
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
if(!all_valid)
if constexpr(!is_same_v<DLayout, Row>)
{
return false;
all_valid = false;
}
});
// check vector store of E
// only support RowMajor for now
if constexpr(is_same_v<ELayout, Row>)
{
if(arg.NRaw_ % CDEBlockTransferScalarPerVector_NPerBlock != 0)
{
return false;
}
}
else
if(!all_valid)
{
return false;
}
// check vector store of E
// only support RowMajor for now
if constexpr(is_same_v<ELayout, Row>)
{
if(NRaw_ % CDEBlockTransferScalarPerVector_NPerBlock != 0)
{
return false;
}
}
else
{
return false;
}
return true;
}
return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
static bool IsSupportedArgument(const Argument& arg)
{
if(!ck::is_xdl_supported())
{
return false;
}
return IsSupported(arg.MRaw_, arg.NRaw_, arg.KRaw_) and
GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
arg.b_grid_desc_n_k_,
arg.ds_grid_desc_m_n_,
arg.e_grid_desc_m_n_,
......@@ -708,6 +709,178 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
return str.str();
}
// Compile-time descriptor bundle for one GEMM-multiple-D problem.
// From the raw A/B/Ds/E tensor descriptors it precomputes the padded
// problem descriptors, the block/thread-wise copy descriptors, and the
// block-to-E-tile map consumed by GridwiseGemm, so a device-side Run()
// can be driven from a single constexpr-constructible object.
template <class ADesc, class BDesc, class DsDesc, class EDesc>
struct Descriptor
{
    // Pad every D descriptor to M/N block multiples (same padding as E).
    static constexpr auto ds_tuple()
    {
        return transform_tuples(
            [&](auto d) constexpr { return DeviceOp::matrix_padder.PadCDescriptor_M_N(d); },
            DsDesc{});
    }
    using AGridDesc_M_K =
        remove_cvref_t<decltype(DeviceOp::matrix_padder.PadADescriptor_M_K(ADesc{}))>;
    using BGridDesc_N_K =
        remove_cvref_t<decltype(DeviceOp::matrix_padder.PadBDescriptor_N_K(BDesc{}))>;
    using DsGridDesc_M_N = remove_cvref_t<decltype(ds_tuple())>;
    using EGridDesc_M_N =
        remove_cvref_t<decltype(DeviceOp::matrix_padder.PadCDescriptor_M_N(EDesc{}))>;
    using AGridDesc_AK0_M_AK1 =
        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
            DeviceOp::matrix_padder.PadADescriptor_M_K(ADesc{})))>;
    using BGridDesc_BK0_N_BK1 =
        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
            DeviceOp::matrix_padder.PadBDescriptor_N_K(BDesc{})))>;
    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
            ds_tuple()))>;
    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
        decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
            DeviceOp::matrix_padder.PadCDescriptor_M_N(EDesc{})))>;
    using Block2ETileMap = remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(
        DeviceOp::matrix_padder.PadCDescriptor_M_N(EDesc{})))>;
    // tensor descriptors for problem definition
    AGridDesc_M_K a_grid_desc_m_k;
    BGridDesc_N_K b_grid_desc_n_k;
    DsGridDesc_M_N ds_grid_desc_m_n;
    EGridDesc_M_N e_grid_desc_m_n;
    // tensor descriptors for block/thread-wise copy
    AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1;
    BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1;
    DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock;
    EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock;
    // block-to-e-tile map
    Block2ETileMap block_2_etile_map;
    // element-wise op
    AElementwiseOperation a_element_op;
    BElementwiseOperation b_element_op;
    CDEElementwiseOperation cde_element_op;
    // for checking vector load/store (raw, unpadded problem sizes)
    index_t MRaw;
    index_t NRaw;
    index_t KRaw;
    bool has_main_k_block_loop = true;
    // NOTE(review): the initializer list below is not in declaration order
    // (-Wreorder); members are initialized in declaration order regardless,
    // which is still correct here because has_main_k_block_loop only reads
    // a_grid_desc_ak0_m_ak1, which is declared (and thus initialized)
    // earlier — confirm no further reordering is introduced.
    constexpr Descriptor(ADesc a,
                         BDesc b,
                         DsDesc ds,
                         EDesc e,
                         AElementwiseOperation a_element_op_,
                         BElementwiseOperation b_element_op_,
                         CDEElementwiseOperation cde_element_op_)
        : a_grid_desc_m_k{DeviceOp::matrix_padder.PadADescriptor_M_K(a)},
          b_grid_desc_n_k{DeviceOp::matrix_padder.PadBDescriptor_N_K(b)},
          ds_grid_desc_m_n{transform_tuples(
              [&](auto d) constexpr { return DeviceOp::matrix_padder.PadCDescriptor_M_N(d); },
              ds)},
          e_grid_desc_m_n{DeviceOp::matrix_padder.PadCDescriptor_M_N(e)},
          a_grid_desc_ak0_m_ak1{
              GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k)},
          b_grid_desc_bk0_n_bk1{
              GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k)},
          ds_grid_desc_mblock_mperblock_nblock_nperblock{
              GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                  transform_tuples(
                      [&](auto d) constexpr {
                          return DeviceOp::matrix_padder.PadCDescriptor_M_N(d);
                      },
                      ds))},
          e_grid_desc_mblock_mperblock_nblock_nperblock{
              GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                  e_grid_desc_m_n)},
          block_2_etile_map{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n)},
          has_main_k_block_loop{GridwiseGemm::CalculateHasMainKBlockLoop(
              a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2))},
          a_element_op{a_element_op_},
          b_element_op{b_element_op_},
          cde_element_op{cde_element_op_},
          MRaw{e.GetLength(I0)},
          NRaw{e.GetLength(I1)},
          KRaw{a.GetLength(I1)}
    {
    }
    // True when both the gridwise GEMM accepts the padded descriptors and
    // the raw sizes satisfy the vector load/store divisibility checks.
    constexpr bool IsValid() const
    {
        return GridwiseGemm::CheckValidity(a_grid_desc_m_k,
                                           b_grid_desc_n_k,
                                           ds_grid_desc_m_n,
                                           e_grid_desc_m_n,
                                           block_2_etile_map) and
               IsSupported(MRaw, NRaw, KRaw);
    }
    constexpr index_t GetBlockSize() const { return BlockSize; }
    constexpr index_t GetGridSize() const
    {
        return block_2_etile_map.CalculateGridSize(e_grid_desc_m_n);
    }
};
// Factory that deduces the Descriptor's template parameters from the
// argument types; element-wise operations default to value-constructed
// instances.
template <class ADesc, class BDesc, class DsDesc, class EDesc>
static constexpr auto
make_descriptor(ADesc a,
                BDesc b,
                DsDesc ds,
                EDesc e,
                AElementwiseOperation a_element_op     = AElementwiseOperation{},
                BElementwiseOperation b_element_op     = BElementwiseOperation{},
                CDEElementwiseOperation cde_element_op = CDEElementwiseOperation{})
{
    return Descriptor<ADesc, BDesc, DsDesc, EDesc>(
        a, b, ds, e, a_element_op, b_element_op, cde_element_op);
}
// Device-side entry point: runs the gridwise GEMM described by `desc` on
// the given global-memory pointers. The shared-memory workspace is
// allocated here (__shared__), sized by GridwiseGemm. The branch on
// has_main_k_block_loop selects the compile-time Run<true/false> variant.
template <class Desc, class DsPointer>
__device__ static void Run(const Desc& desc,
                           const ADataType* __restrict__ p_a_grid,
                           const BDataType* __restrict__ p_b_grid,
                           DsPointer p_ds_grid,
                           EDataType* __restrict__ p_e_grid)
{
    __shared__ char p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    // Validity is a debug-only check here; callers are expected to have
    // verified desc.IsValid() on the host.
    assert(desc.IsValid());
    if(desc.has_main_k_block_loop)
    {
        GridwiseGemm::template Run<true>(p_a_grid,
                                         p_b_grid,
                                         p_ds_grid,
                                         p_e_grid,
                                         p_shared_block,
                                         desc.a_element_op,
                                         desc.b_element_op,
                                         desc.cde_element_op,
                                         desc.a_grid_desc_ak0_m_ak1,
                                         desc.b_grid_desc_bk0_n_bk1,
                                         desc.ds_grid_desc_mblock_mperblock_nblock_nperblock,
                                         desc.e_grid_desc_mblock_mperblock_nblock_nperblock,
                                         desc.block_2_etile_map);
    }
    else
    {
        GridwiseGemm::template Run<false>(p_a_grid,
                                          p_b_grid,
                                          p_ds_grid,
                                          p_e_grid,
                                          p_shared_block,
                                          desc.a_element_op,
                                          desc.b_element_op,
                                          desc.cde_element_op,
                                          desc.a_grid_desc_ak0_m_ak1,
                                          desc.b_grid_desc_bk0_n_bk1,
                                          desc.ds_grid_desc_mblock_mperblock_nblock_nperblock,
                                          desc.e_grid_desc_mblock_mperblock_nblock_nperblock,
                                          desc.block_2_etile_map);
    }
}
};
} // namespace device
......
......@@ -24,10 +24,10 @@ struct BlockToCTileMap_M00_N0_M01
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
__host__ __device__ BlockToCTileMap_M00_N0_M01() = default;
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01() = default;
__host__ __device__ BlockToCTileMap_M00_N0_M01(const CGridDesc_M_N& c_grid_desc_m_n,
index_t M01 = 1)
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01(const CGridDesc_M_N& c_grid_desc_m_n,
index_t M01 = 1)
: M01_(M01), underlying_map_(GetBlockToCTileMap(c_grid_desc_m_n, M01))
{
}
......@@ -51,8 +51,8 @@ struct BlockToCTileMap_M00_N0_M01
}
template <typename CTileIdx, typename CTileDim>
__host__ __device__ bool ValidCTileIndex(const CTileIdx& c_tile_idx,
const CTileDim& c_tile_dim) const
__host__ __device__ constexpr bool ValidCTileIndex(const CTileIdx& c_tile_idx,
const CTileDim& c_tile_dim) const
{
if constexpr(DeviceCTileIndexCheck)
return DefaultValidCTileIndex(c_tile_idx, c_tile_dim);
......@@ -60,7 +60,7 @@ struct BlockToCTileMap_M00_N0_M01
return true;
}
__host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
__host__ constexpr bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
{
if constexpr(DeviceCTileIndexCheck)
return true; // validity check moved to kernel
......@@ -120,18 +120,19 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
__host__ __device__ BlockToCTileMap_M00_N0_M01Adapt() = default;
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt() = default;
__host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const BlockToCTileMap_M00_N0_M01Adapt&) =
default;
__host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(BlockToCTileMap_M00_N0_M01Adapt&&) =
default;
__host__ __device__ BlockToCTileMap_M00_N0_M01Adapt&
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt(
const BlockToCTileMap_M00_N0_M01Adapt&) = default;
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt(
BlockToCTileMap_M00_N0_M01Adapt&&) = default;
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt&
operator=(const BlockToCTileMap_M00_N0_M01Adapt&) = default;
__host__ __device__ BlockToCTileMap_M00_N0_M01Adapt&
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt&
operator=(BlockToCTileMap_M00_N0_M01Adapt&&) = default;
__host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(index_t M, index_t N, index_t M01 = 8)
__host__
__device__ constexpr BlockToCTileMap_M00_N0_M01Adapt(index_t M, index_t N, index_t M01 = 8)
: M_(M), N_(N), M01_(M01)
{
#if 0
......@@ -142,8 +143,9 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
}
template <typename CGridDesc_M_N>
__host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n,
index_t M01 = 8)
__host__
__device__ constexpr BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n,
index_t M01 = 8)
: BlockToCTileMap_M00_N0_M01Adapt(
c_grid_desc_m_n.GetLength(I0), c_grid_desc_m_n.GetLength(I1), M01)
{
......@@ -164,7 +166,7 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
}
template <typename CGridDesc_M_N>
__host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
__host__ constexpr bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
{
return true;
}
......@@ -237,8 +239,8 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
}
template <typename CTileIdx, typename CTileDim>
__host__ __device__ bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */,
const CTileDim& /* c_tile_dim */) const
__host__ __device__ constexpr bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */,
const CTileDim& /* c_tile_dim */) const
{
return true; // always valid provided that user gets grid size from CalculateGridSize()
}
......@@ -616,7 +618,10 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt
return true; // always valid provided that user gets grid size from CalculateGridSize()
}
__host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const { return true; }
__host__ constexpr bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
{
return true;
}
private:
index_t M01_;
......@@ -674,7 +679,7 @@ struct BlockToCTileMap_M00_N00_M01_N01
return true;
}
__host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
__host__ constexpr bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
{
if constexpr(DeviceCTileIndexCheck)
return true; // validity check moved to kernel
......@@ -786,7 +791,7 @@ struct BlockToCTileMap_KSplit_M00_N00_M01_N01
return true;
}
__host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
__host__ constexpr bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
{
if constexpr(DeviceCTileIndexCheck)
return true; // validity check moved to kernel
......@@ -910,7 +915,7 @@ struct OffsettedBlockToCTileMap
}
template <typename CGridDesc_M_N>
__host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
__host__ constexpr bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
{
return block_to_ctile_map_.CheckValidity(c_grid_desc_m_n);
}
......@@ -967,7 +972,7 @@ struct BlockToCTileMap_3DGrid_KSplit
}
template <typename CGridDesc_M_N>
__host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
__host__ constexpr bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
{
return true;
}
......
......@@ -264,7 +264,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
const BGridDesc_N_K& b_grid_desc_n_k,
const DsGridDesc_M_N& ds_grid_desc_m_n,
const EGridDesc_M_N& e_grid_desc_m_n,
const Block2ETileMap& block_2_etile_map)
const Block2ETileMap&)
{
static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
(NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
......@@ -310,10 +310,10 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
}
// check block-to-E-tile
if(!block_2_etile_map.CheckValidity(e_grid_desc_m_n))
{
return false;
}
// if(!block_2_etile_map.CheckValidity(e_grid_desc_m_n))
//{
// return false;
//}
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
// check tensor size: cannot be larger than 2GB each
......
......@@ -166,9 +166,6 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
{
constexpr int seed = 1254739;
uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
float max_fp8 = 240.0f;
if(!std::isinf(x))
x = x > max_fp8 ? max_fp8 : (x < -max_fp8 ? -max_fp8 : x);
#if defined(__gfx94__)
union
{
......@@ -176,10 +173,15 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
uint32_t i32val;
uint8_t i8val[4]; // not endian independent
} val;
val.fval = x;
uint32_t ival = 0;
ival = __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0); // 0 pos
val.i32val = ival;
val.fval = x;
uint32_t ival = 0;
const float max_fp8 = 240.0f;
// if x is not +/- infinity or nan
if((val.i32val & NumericUtils<float>::nan_mask) != NumericUtils<float>::Inf)
// clip float value
val.fval = __builtin_amdgcn_fmed3f(val.fval, max_fp8, -max_fp8);
ival = __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0); // 0 pos
val.i32val = ival;
return val.i8val[0]; // little endian
#else
constexpr bool negative_zero_nan = true;
......@@ -223,10 +225,15 @@ inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
uint32_t i32val;
uint8_t i8val[4]; // not endian independent
} val;
val.fval = x;
uint32_t ival = 0;
ival = __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
val.i32val = ival;
val.fval = x;
uint32_t ival = 0;
const float max_bf8 = 57344.0f;
// if x is not +/- infinity or nan
if((val.i32val & NumericUtils<float>::nan_mask) != NumericUtils<float>::Inf)
// clip float value
val.fval = __builtin_amdgcn_fmed3f(val.fval, max_bf8, -max_bf8);
ival = __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
val.i32val = ival;
return val.i8val[0]; // little endian
#else
constexpr bool negative_zero_nan = true;
......@@ -265,9 +272,6 @@ __host__ __device__ constexpr Y f8_convert_rne(X x);
template <>
inline __host__ __device__ f8_t f8_convert_rne<f8_t, float>(float x)
{
float max_fp8 = 240.0f;
if(!std::isinf(x))
x = x > max_fp8 ? max_fp8 : (x < -max_fp8 ? -max_fp8 : x);
#if defined(__gfx94__)
union
{
......@@ -275,8 +279,13 @@ inline __host__ __device__ f8_t f8_convert_rne<f8_t, float>(float x)
uint32_t i32val;
uint8_t i8val[4]; // not endian independent
} val;
val.fval = x;
uint32_t ival = 0;
val.fval = x;
uint32_t ival = 0;
const float max_fp8 = 240.0f;
// if x is not +/- infinity or nan
if((val.i32val & NumericUtils<float>::nan_mask) != NumericUtils<float>::Inf)
// clip float value
val.fval = __builtin_amdgcn_fmed3f(val.fval, max_fp8, -max_fp8);
ival = __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false); // false -> WORD0
val.i32val = ival;
return val.i8val[0];
......@@ -320,8 +329,13 @@ inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, float>(float x)
uint32_t i32val;
uint8_t i8val[4]; // not endian independent
} val;
val.fval = x;
uint32_t ival = 0;
val.fval = x;
uint32_t ival = 0;
const float max_bf8 = 57344.0f;
// if x is not +/- infinity or nan
if((val.i32val & NumericUtils<float>::nan_mask) != NumericUtils<float>::Inf)
// clip float value
val.fval = __builtin_amdgcn_fmed3f(val.fval, max_bf8, -max_bf8);
ival = __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, val.fval, ival, false); // false -> WORD0
val.i32val = ival;
return val.i8val[0];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment