Commit 3e4d0ff3 authored by Jakub Piasecki
Browse files

Merge remote-tracking branch 'origin/develop' into ggemm_multid_two_stage

parents 1ad29336 9e011bcd
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL
#include <hip/hip_runtime_api.h>
#include <memory>
#include <string>
#include <vector>
namespace rtc {
/// Type-erased view of a single kernel argument.
///
/// Stores the size and alignment of the referenced object together with a
/// non-owning pointer to it, so the object must stay alive for as long as the
/// view is used.
struct kernel_argument
{
    /// Captures any object that is not itself a kernel_argument.
    ///
    /// The constraint is evaluated on the reference-stripped type U. Testing T
    /// directly would always succeed for lvalues (T deduces to a reference type,
    /// which is never derived from anything), and this constructor would then
    /// win over the copy constructor for non-const kernel_argument lvalues and
    /// record the size/address of the kernel_argument itself.
    template <class T,
              class U = std::remove_reference_t<T>,
              class   = std::enable_if_t<not std::is_base_of<kernel_argument, U>{}>>
    kernel_argument(T&& x) : size(sizeof(U)), align(alignof(U)), data(&x) // NOLINT
    {
    }
    std::size_t size;  ///< sizeof the referenced object
    std::size_t align; ///< alignof the referenced object
    void* data;        ///< address of the referenced object (not owned)
};
// Returns the bytes of all arguments packed into a single buffer.
std::vector<char> pack_args(const std::vector<kernel_argument>& args);
// Forward declaration; kernel only holds it through a std::shared_ptr.
struct kernel_impl;
// Handle to a kernel; its state is held in a shared kernel_impl.
struct kernel
{
// A default-constructed kernel has a null impl.
kernel() = default;
// Constructs a kernel for the function `name` from the data pointed to by `image`.
kernel(const char* image, const std::string& name);
// Convenience overload for an image stored in a vector of single-byte elements;
// forwards to the (const char*, name) constructor.
template <class T>
kernel(const std::vector<T>& image, const std::string& name)
: kernel(reinterpret_cast<const char*>(image.data()), name)
{
static_assert(sizeof(T) == 1, "Only byte types");
}
// Launch with arguments described by kernel_argument views.
// NOTE(review): global/local look like launch dimensions - confirm against the implementation.
void launch(hipStream_t stream,
std::size_t global,
std::size_t local,
const std::vector<kernel_argument>& args) const;
// Launch with arguments given as a vector of void* values.
void launch(hipStream_t stream,
std::size_t global,
std::size_t local,
std::vector<void*> args) const;
// Returns a callable that captures the launch parameters by copy; invoking it
// with xs... wraps them in a std::vector<kernel_argument> and calls launch with
// that vector followed by zs....
template <class... Ts>
auto launch(hipStream_t stream, std::size_t global, std::size_t local, Ts... zs) const
{
return [=](auto&&... xs) {
launch(stream, global, local, std::vector<kernel_argument>{xs...}, zs...);
};
}
private:
std::shared_ptr<kernel_impl> impl;
};
} // namespace rtc
#endif
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER
#include <type_traits>
#include <memory>
namespace rtc {
/// Deleter that passes non-null pointers to the compile-time callable `f`.
template <class F, F f>
struct manage_deleter
{
    template <class T>
    void operator()(T* x) const
    {
        // Null pointers are never handed to f.
        if(x == nullptr)
            return;
        // Whatever f returns is deliberately discarded.
        (void)f(x);
    }
};
/// Deleter whose call operator has an empty body: the pointer is left untouched.
struct null_deleter
{
    template <class Pointee>
    void operator()(Pointee* /*unused*/) const
    {
    }
};
// std::unique_ptr to T whose deleter is manage_deleter<F, f>.
template <class T, class F, F f>
using manage_ptr = std::unique_ptr<T, manage_deleter<F, f>>;
/// Exposes the nested `element_type` of a smart-pointer-like type T.
template <class T>
struct element_type
{
    using type = typename T::element_type;
};

/// Pointee type of T: std::remove_pointer for raw pointers, T::element_type otherwise.
template <class T>
using remove_ptr = typename std::conditional<std::is_pointer<T>::value,
                                             std::remove_pointer<T>,
                                             element_type<T>>::type::type;

/// std::shared_ptr to the pointee type of T.
template <class T>
using shared = std::shared_ptr<remove_ptr<T>>;

/// Moves `p` into a std::shared_ptr of the matching pointee type.
template <class T>
shared<T> share(T p)
{
    shared<T> owner{std::move(p)};
    return owner;
}
// Expands to rtc::manage_ptr for std::remove_pointer_t<T>, with F as the deleter function.
#define RTC_MANAGE_PTR(T, F) rtc::manage_ptr<std::remove_pointer_t<T>, decltype(&F), &F>
} // namespace rtc
#endif
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#include <string>
#include <filesystem>
namespace rtc {
/// Non-copyable handle around a filesystem path, with a helper to run commands.
struct tmp_dir
{
    std::filesystem::path path;

    tmp_dir(const std::string& prefix = "");
    ~tmp_dir();

    // Copying is disabled.
    tmp_dir(const tmp_dir&)            = delete;
    tmp_dir& operator=(const tmp_dir&) = delete;

    void execute(const std::string& cmd) const;
};
} // namespace rtc
#endif
#include "rtc/hip.hpp"
#include <rtc/compile_kernel.hpp>
#include <rtc/tmp_dir.hpp>
#include <stdexcept>
#include <iostream>
#include <fstream>
#include <cassert>
namespace rtc {
/// Reads bytes from a file into a container of type T (e.g. std::string or
/// std::vector<char>).
///
/// @param filename  file to read
/// @param offset    byte offset at which reading starts
/// @param nbytes    number of bytes to read; 0 means "from offset to end of file"
/// @return          a T holding exactly the bytes read
/// @throws std::runtime_error if the file cannot be opened, the offset lies
///         beyond the end of the file, the resulting size is zero, or the read
///         comes up short
template <class T>
T generic_read_file(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
{
    // Open positioned at the end so tellg() reports the file size.
    std::ifstream is(filename, std::ios::binary | std::ios::ate);
    // Without this check a missing file makes tellg() return -1, which would be
    // converted to a huge unsigned size below.
    if(not is.is_open())
        throw std::runtime_error("Failed to open file: " + filename);
    if(nbytes == 0)
    {
        // if there is a non-zero offset and nbytes is not set,
        // calculate size of remaining bytes to read
        const std::streamoff file_size = is.tellg();
        if(file_size < 0)
            throw std::runtime_error("Failed to get file size: " + filename);
        nbytes = static_cast<size_t>(file_size);
        if(offset > nbytes)
            throw std::runtime_error("offset is larger than file size");
        nbytes -= offset;
    }
    if(nbytes < 1)
        throw std::runtime_error("Invalid size for: " + filename);
    is.seekg(static_cast<std::streamoff>(offset), std::ios::beg);
    T buffer(nbytes, 0);
    if(not is.read(buffer.data(), static_cast<std::streamsize>(nbytes)))
        throw std::runtime_error("Error reading file: " + filename);
    return buffer;
}
/// Reads a file (or a slice of it) into a byte vector via generic_read_file.
std::vector<char> read_buffer(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
{
    using byte_buffer = std::vector<char>;
    return generic_read_file<byte_buffer>(filename, offset, nbytes);
}
/// Reads a file into a std::string via generic_read_file with its default offset and size.
std::string read_string(const std::string& filename)
{
    std::string text = generic_read_file<std::string>(filename);
    return text;
}
/// Writes `size` bytes starting at `buffer` to `filename`, replacing any
/// existing content.
///
/// The stream is opened in binary mode so the bytes are written unmodified.
///
/// @throws std::runtime_error if the file cannot be opened or the write fails
void write_buffer(const std::string& filename, const char* buffer, std::size_t size)
{
    std::ofstream os(filename, std::ios::out | std::ios::binary);
    if(not os.is_open())
        throw std::runtime_error("Failed to open file for writing: " + filename);
    os.write(buffer, static_cast<std::streamsize>(size));
    // Close explicitly so that flush errors are observed before returning.
    os.close();
    if(os.fail())
        throw std::runtime_error("Error writing file: " + filename);
}
/// Writes the whole contents of `buffer` to `filename` using the pointer/size overload.
void write_buffer(const std::string& filename, const std::vector<char>& buffer)
{
    const char* bytes       = buffer.data();
    const std::size_t count = buffer.size();
    write_buffer(filename, bytes, count);
}
/// Writes the characters viewed by `buffer` to `filename` using write_buffer.
void write_string(const std::string& filename, const std::string_view& buffer)
{
    const char* first        = buffer.data();
    const std::size_t length = buffer.size();
    write_buffer(filename, first, length);
}
/// Returns the compiler command line prefix used to build kernels.
std::string compiler()
{
    static const char* const command = "/opt/rocm/llvm/bin/clang++ -x hip --cuda-device-only";
    return std::string(command);
}
/// Writes the given sources into a fresh temporary directory, compiles them with
/// the command from compiler(), and loads the resulting object as a kernel.
///
/// Every source is written to disk; sources whose extension is ".cpp" are added
/// to the command line with "-c", and the first of them names the output object
/// ("<stem>.o").
///
/// @param srcs     source files (path + content) to write and compile
/// @param options  extra flags and the name of the kernel function to load
/// @return         the kernel named options.kernel_name from the compiled object
/// @throws std::runtime_error if no ".cpp" source is present or the compiler
///         did not produce the expected output file
kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options)
{
    assert(not srcs.empty());
    tmp_dir td{"compile"};
    options.flags += " -I. -O3";
    options.flags += " -std=c++17";
    options.flags += " --offload-arch=" + get_device_name();
    std::string out;
    for(const auto& src : srcs)
    {
        const std::filesystem::path full_path = td.path / src.path;
        std::filesystem::create_directories(full_path.parent_path());
        write_string(full_path.string(), src.content);
        if(src.path.extension().string() == ".cpp")
        {
            options.flags += " -c " + src.path.filename().string();
            if(out.empty())
                out = src.path.stem().string() + ".o";
        }
    }
    // Without a translation unit there is no output name: the command would end
    // in a dangling "-o" and td.path / "" would refer to the directory itself.
    if(out.empty())
        throw std::runtime_error("No .cpp source file to compile");
    options.flags += " -o " + out;
    td.execute(compiler() + options.flags);
    const auto out_path = td.path / out;
    if(not std::filesystem::exists(out_path))
        throw std::runtime_error("Output file missing: " + out);
    auto obj = read_buffer(out_path.string());
    return kernel{obj.data(), options.kernel_name};
}
} // namespace rtc
#include <rtc/hip.hpp>
#include <rtc/manage_ptr.hpp>
#include <stdexcept>
#include <cassert>
namespace rtc {
using hip_ptr = RTC_MANAGE_PTR(void, hipFree);
std::string hip_error(int error) { return hipGetErrorString(static_cast<hipError_t>(error)); }
/// Returns the device id reported by hipGetDevice.
/// Throws std::runtime_error when the query fails.
int get_device_id()
{
    int device;
    if(hipGetDevice(&device) != hipSuccess)
        throw std::runtime_error("No device");
    return device;
}
/// Returns the gcnArchName property of the device given by get_device_id().
/// Throws std::runtime_error when the properties cannot be queried.
std::string get_device_name()
{
    hipDeviceProp_t props{};
    if(hipGetDeviceProperties(&props, get_device_id()) != hipSuccess)
        throw std::runtime_error("Failed to get device properties");
    std::string arch_name = props.gcnArchName;
    return arch_name;
}
/// Returns true when hipPointerGetAttributes succeeds for `ptr` and reports
/// hipMemoryTypeDevice; returns false otherwise (including on query failure).
bool is_device_ptr(const void* ptr)
{
    hipPointerAttribute_t attr;
    const bool query_ok = hipPointerGetAttributes(&attr, ptr) == hipSuccess;
    // attr is only inspected when the query succeeded.
    return query_ok and attr.type == hipMemoryTypeDevice;
}
/// Calls hipDeviceSynchronize and throws std::runtime_error if it fails.
void gpu_sync()
{
    const auto status = hipDeviceSynchronize();
    if(status == hipSuccess)
        return;
    throw std::runtime_error("hip device synchronization failed: " + hip_error(status));
}
/// Returns the "free" value reported by hipMemGetInfo.
/// Throws std::runtime_error when the query fails.
std::size_t get_available_gpu_memory()
{
    size_t free_bytes;
    size_t total_bytes;
    const auto status = hipMemGetInfo(&free_bytes, &total_bytes);
    if(status == hipSuccess)
        return free_bytes;
    throw std::runtime_error("Failed getting available memory: " + hip_error(status));
}
/// Allocates `sz` bytes with hipHostMalloc (host == true) or hipMalloc and
/// returns the buffer as a shared pointer whose deleter is hip_ptr's.
///
/// If the hipMalloc path fails, the allocation is retried once through the
/// hipHostMalloc path. Throws std::runtime_error when `sz` exceeds
/// get_available_gpu_memory() or when the hipHostMalloc path fails.
std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host)
{
    if(get_available_gpu_memory() < sz)
        throw std::runtime_error("Memory not available to allocate buffer: " + std::to_string(sz));
    void* raw         = nullptr;
    const auto status = host ? hipHostMalloc(&raw, sz) : hipMalloc(&raw, sz);
    if(status != hipSuccess and host)
        throw std::runtime_error("Gpu allocation failed: " + hip_error(status));
    // A failed hipMalloc falls back to the host allocation path.
    if(status != hipSuccess)
        return allocate_gpu(sz, true);
    assert(raw != nullptr);
    std::shared_ptr<void> owner = share(hip_ptr{raw});
    return owner;
}
/// Synchronizes, allocates `sz` bytes via allocate_gpu and copies `sz` bytes
/// from `x` into the new buffer with hipMemcpyHostToDevice.
/// Throws std::runtime_error when the copy fails.
std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host)
{
    gpu_sync();
    std::shared_ptr<void> destination = allocate_gpu(sz, host);
    assert(is_device_ptr(destination.get()));
    assert(not is_device_ptr(x));
    const auto status = hipMemcpy(destination.get(), x, sz, hipMemcpyHostToDevice);
    if(status == hipSuccess)
        return destination;
    throw std::runtime_error("Copy to gpu failed: " + hip_error(status));
}
/// Synchronizes and copies `sz` bytes from the device buffer `x` into a newly
/// allocated host buffer.
///
/// @param x   source pointer; must satisfy is_device_ptr
/// @param sz  number of bytes to copy
/// @return    shared pointer owning the host copy
/// @throws std::runtime_error if `x` is not a device pointer or the copy fails
std::shared_ptr<void> read_from_gpu(const void* x, std::size_t sz)
{
    gpu_sync();
    // The storage comes from new[], so it must be released with delete[];
    // a shared_ptr<char> without a custom deleter would call plain delete.
    std::shared_ptr<char> result(new char[sz], std::default_delete<char[]>());
    assert(not is_device_ptr(result.get()));
    if(not is_device_ptr(x))
    {
        throw std::runtime_error(
            "read_from_gpu() requires Src buffer to be on the GPU, Copy from gpu failed\n");
    }
    auto status = hipMemcpy(result.get(), x, sz, hipMemcpyDeviceToHost);
    if(status != hipSuccess)
        throw std::runtime_error("Copy from gpu failed: " + hip_error(status)); // NOLINT
    return std::static_pointer_cast<void>(result);
}
} // namespace rtc
#include <rtc/kernel.hpp>
#include <rtc/manage_ptr.hpp>
#include <rtc/hip.hpp>
#include <cassert>
// extern declare the function since hip/hip_ext.h header is broken
extern hipError_t hipExtModuleLaunchKernel(hipFunction_t, // NOLINT
uint32_t,
uint32_t,
uint32_t,
uint32_t,
uint32_t,
uint32_t,
size_t,
hipStream_t,
void**,
void**,
hipEvent_t = nullptr,
hipEvent_t = nullptr,
uint32_t = 0);
namespace rtc {
/// Concatenates the bytes of every argument into one buffer, inserting zero
/// bytes before each argument so that it starts at a multiple of its alignment.
std::vector<char> pack_args(const std::vector<kernel_argument>& args)
{
    std::vector<char> packed;
    for(const auto& item : args)
    {
        // Zero-fill up to the next multiple of this argument's alignment.
        const std::size_t misalignment = packed.size() % item.align;
        if(misalignment != 0)
            packed.resize(packed.size() + (item.align - misalignment), 0);
        const auto* first = static_cast<const char*>(item.data);
        const auto* last  = first + item.size;
        packed.insert(packed.end(), first, last);
    }
    return packed;
}
using hip_module_ptr = RTC_MANAGE_PTR(hipModule_t, hipModuleUnload);
// State behind a kernel handle: the owning module pointer and the function
// handle looked up in it. Both start out null.
struct kernel_impl
{
hip_module_ptr module = nullptr;
hipFunction_t fun = nullptr;
};
/// Loads a module from the in-memory image with hipModuleLoadData.
///
/// @param image  pointer to the module data
/// @return       owning pointer whose deleter calls hipModuleUnload
/// @throws std::runtime_error if loading fails
hip_module_ptr load_module(const char* image)
{
    // Start from null: if the load fails without writing the handle, the owning
    // pointer below must not hand an indeterminate value to hipModuleUnload
    // (the deleter skips null pointers).
    hipModule_t raw_m = nullptr;
    auto status       = hipModuleLoadData(&raw_m, image);
    hip_module_ptr m{raw_m};
    if(status != hipSuccess)
        throw std::runtime_error("Failed to load module: " + hip_error(status));
    return m;
}
/// Loads the module from `image` and looks up the function `name` in it.
/// Throws std::runtime_error when the lookup fails.
kernel::kernel(const char* image, const std::string& name) : impl(std::make_shared<kernel_impl>())
{
    impl->module = load_module(image);
    const auto status = hipModuleGetFunction(&impl->fun, impl->module.get(), name.c_str());
    if(status == hipSuccess)
        return;
    throw std::runtime_error("Failed to get function: " + name + ": " + hip_error(status));
}
/// Launches `fun` on `stream` through hipExtModuleLaunchKernel, passing the
/// argument buffer `kernargs` of `size` bytes via the HIP_LAUNCH_PARAM_* config
/// array. Throws std::runtime_error when the launch call fails.
// NOTE(review): global/local are passed as the 2nd and 5th arguments with 1s in
// between - presumably x-dimension sizes; confirm against the HIP documentation.
void launch_kernel(hipFunction_t fun,
                   hipStream_t stream,
                   std::size_t global,
                   std::size_t local,
                   void* kernargs,
                   std::size_t size)
{
    assert(global > 0);
    assert(local > 0);
    void* launch_config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER,
                             kernargs,
                             HIP_LAUNCH_PARAM_BUFFER_SIZE,
                             &size,
                             HIP_LAUNCH_PARAM_END};
    void** extra      = reinterpret_cast<void**>(&launch_config);
    const auto status = hipExtModuleLaunchKernel(
        fun, global, 1, 1, local, 1, 1, 0, stream, nullptr, extra, nullptr, nullptr);
    if(status == hipSuccess)
        return;
    throw std::runtime_error("Failed to launch kernel: " + hip_error(status));
}
/// Launches the kernel using the pointer array itself as the argument buffer.
void kernel::launch(hipStream_t stream,
                    std::size_t global,
                    std::size_t local,
                    std::vector<void*> args) const
{
    assert(impl != nullptr);
    const std::size_t byte_count = args.size() * sizeof(void*);
    launch_kernel(impl->fun, stream, global, local, args.data(), byte_count);
}
/// Packs the arguments with pack_args and launches the kernel with that buffer.
void kernel::launch(hipStream_t stream,
                    std::size_t global,
                    std::size_t local,
                    const std::vector<kernel_argument>& args) const
{
    assert(impl != nullptr);
    std::vector<char> packed = pack_args(args);
    launch_kernel(impl->fun, stream, global, local, packed.data(), packed.size());
}
} // namespace rtc
\ No newline at end of file
#include <rtc/tmp_dir.hpp>
#include <unistd.h>
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <random>
#include <sstream>
#include <stdexcept>
#include <string>
#include <system_error>
#include <thread>
namespace rtc {
/// Returns `length` characters drawn uniformly from digits and ASCII letters,
/// using a std::mt19937 seeded from std::random_device on every call.
std::string random_string(std::string::size_type length)
{
    static const std::string alphabet = "0123456789"
                                        "abcdefghijklmnopqrstuvwxyz"
                                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    std::mt19937 engine{std::random_device{}()};
    std::uniform_int_distribution<std::string::size_type> index(0, alphabet.length() - 1);
    std::string result;
    result.reserve(length);
    for(std::string::size_type i = 0; i < length; ++i)
        result.push_back(alphabet[index(engine)]);
    return result;
}
/// Builds "<prefix>-<pid>-<thread id>-<steady clock ticks>-<16 random chars>",
/// with std::hex set on the stream.
std::string unique_string(const std::string& prefix)
{
    const auto process_id = getpid();
    const auto thread_id  = std::this_thread::get_id();
    const auto ticks      = std::chrono::steady_clock::now().time_since_epoch().count();
    std::stringstream out;
    out << std::hex << prefix;
    out << "-" << process_id;
    out << "-" << thread_id;
    out << "-" << ticks;
    out << "-" << random_string(16);
    return out.str();
}
/// Picks a unique path under std::filesystem::temp_directory_path(), named
/// "ck-rtc" or "ck-rtc-<prefix>" plus a unique suffix, and creates the directory.
tmp_dir::tmp_dir(const std::string& prefix)
    : path(std::filesystem::temp_directory_path() /
           unique_string(prefix.empty() ? std::string("ck-rtc") : std::string("ck-rtc-") + prefix))
{
    std::filesystem::create_directories(path);
}
/// Runs `cmd` through std::system with the working directory changed to `path`.
///
/// The two shell commands are joined with "&&" so that `cmd` is not run in the
/// wrong directory when the `cd` fails.
///
/// @throws std::runtime_error if std::system reports a non-zero status
void tmp_dir::execute(const std::string& cmd) const
{
    const std::string s = "cd " + path.string() + " && " + cmd;
    if(std::system(s.c_str()) != 0)
        throw std::runtime_error("Command failed: " + s);
}
/// Removes the directory tree at `path`.
///
/// Uses the std::error_code overload of remove_all: the throwing overload could
/// let an exception escape the destructor, which terminates the program.
/// Removal errors are therefore ignored.
tmp_dir::~tmp_dir()
{
    std::error_code ec;
    std::filesystem::remove_all(this->path, ec);
}
} // namespace rtc
\ No newline at end of file
...@@ -20,7 +20,7 @@ CK utilizes two concepts to achieve performance portability and code maintainabi ...@@ -20,7 +20,7 @@ CK utilizes two concepts to achieve performance portability and code maintainabi
* Algorithm complexity reduction for complex ML operators using an innovative technique called * Algorithm complexity reduction for complex ML operators using an innovative technique called
"Tensor Coordinate Transformation". "Tensor Coordinate Transformation".
.. image:: data/ck_component.png .. image:: ../data/ck_component.png
:alt: CK Components :alt: CK Components
...@@ -36,6 +36,6 @@ The CK library is structured into 4 layers: ...@@ -36,6 +36,6 @@ The CK library is structured into 4 layers:
It also includes a simple wrapper component used to perform tensor transform operations more easily and with fewer lines of code. It also includes a simple wrapper component used to perform tensor transform operations more easily and with fewer lines of code.
.. image:: data/ck_layer.png .. image:: ../data/ck_layer.png
:alt: CK Layers :alt: CK Layers
\ No newline at end of file
...@@ -45,3 +45,5 @@ for sphinx_var in ROCmDocs.SPHINX_VARS: ...@@ -45,3 +45,5 @@ for sphinx_var in ROCmDocs.SPHINX_VARS:
extensions += ['sphinxcontrib.bibtex'] extensions += ['sphinxcontrib.bibtex']
bibtex_bibfiles = ['refs.bib'] bibtex_bibfiles = ['refs.bib']
cpp_id_attributes = ["__global__", "__device__", "__host__"]
...@@ -12,27 +12,26 @@ The Composable Kernel (CK) library provides a programming model for writing perf ...@@ -12,27 +12,26 @@ The Composable Kernel (CK) library provides a programming model for writing perf
The CK documentation is structured as follows: The CK documentation is structured as follows:
.. card:: Conceptual .. grid:: 2
:gutter: 3
* :ref:`what-is-ck` .. grid-item-card:: Installation
.. card:: Installation * :ref:`docker-hub`
* :ref:`docker-hub` .. grid-item-card:: Conceptual
.. card:: Tutorial * :ref:`what-is-ck`
* :ref:`hello-world` .. grid-item-card:: API reference
.. card:: API reference * :ref:`supported-primitives`
* :ref:`api-reference`
* :ref:`wrapper`
* :ref:`supported-primitives` .. grid-item-card:: Tutorial
* :ref:`api-reference`
* :ref:`wrapper`
.. card:: Contributing to CK * :ref:`hello-world`
* :ref:`contributing-to`
To contribute to the documentation refer to `Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/index.html>`_. To contribute to the documentation refer to `Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/index.html>`_.
......
...@@ -36,7 +36,7 @@ What is inside the image? ...@@ -36,7 +36,7 @@ What is inside the image?
The docker images have everything you need for running CK including: The docker images have everything you need for running CK including:
* `ROCm <https://www.amd.com/en/graphics/servers-solutions-rocm>`_ * `ROCm <https://rocm.docs.amd.com/en/latest/index.html>`_
* `CMake <https://cmake.org/getting-started/>`_ * `CMake <https://cmake.org/getting-started/>`_
* `Compiler <https://github.com/ROCm/llvm-project>`_ * `Compiler <https://github.com/ROCm/llvm-project>`_
* `Composable Kernel library <https://github.com/ROCm/composable_kernel>`_ * `Composable Kernel library <https://github.com/ROCm/composable_kernel>`_
......
```{include} ../LICENSE.md
```
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _license:
********************************************************************
License
********************************************************************
.. include:: ../LICENSE
\ No newline at end of file
...@@ -64,31 +64,31 @@ Advanced examples: ...@@ -64,31 +64,31 @@ Advanced examples:
Layout Layout
------------------------------------- -------------------------------------
.. doxygenstruct:: ck::wrapper::Layout .. doxygenstruct:: Layout
------------------------------------- -------------------------------------
Layout helpers Layout helpers
------------------------------------- -------------------------------------
.. doxygenfile:: layout_utils.hpp .. doxygenfile:: include/ck/wrapper/utils/layout_utils.hpp
------------------------------------- -------------------------------------
Tensor Tensor
------------------------------------- -------------------------------------
.. doxygenstruct:: ck::wrapper::Tensor .. doxygenstruct:: Tensor
------------------------------------- -------------------------------------
Tensor helpers Tensor helpers
------------------------------------- -------------------------------------
.. doxygenfile:: tensor_utils.hpp .. doxygenfile:: include/ck/wrapper/utils/tensor_utils.hpp
.. doxygenfile:: tensor_partition.hpp .. doxygenfile:: include/ck/wrapper/utils/tensor_partition.hpp
------------------------------------- -------------------------------------
Operations Operations
------------------------------------- -------------------------------------
.. doxygenfile:: copy.hpp .. doxygenfile:: include/ck/wrapper/operations/copy.hpp
.. doxygenfile:: gemm.hpp .. doxygenfile:: include/ck/wrapper/operations/gemm.hpp
...@@ -2,20 +2,35 @@ defaults: ...@@ -2,20 +2,35 @@ defaults:
numbered: False numbered: False
root: index root: index
subtrees: subtrees:
- entries:
- file: what-is-ck.rst - caption: Conceptual
entries:
- file: conceptual/what-is-ck.rst
title: What is Composable Kernel? title: What is Composable Kernel?
- file: dockerhub.rst
- caption: Install
entries:
- file: install/dockerhub.rst
title: Docker Hub title: Docker Hub
- file: tutorial_hello_world.rst
title: Hello World Tutorial - caption: CK API Reference
- file: Supported_Primitives_Guide.rst entries:
- file: reference/Supported_Primitives_Guide.rst
title: Supported Primitives title: Supported Primitives
- file: API_Reference_Guide.rst - file: reference/API_Reference_Guide.rst
title: API Reference title: API Reference
- file: wrapper.rst - file: reference/wrapper.rst
title: Wrapper title: Wrapper
- caption: Tutorial
entries:
- file: tutorial/tutorial_hello_world.rst
title: Hello World Tutorial
- caption: About
entries:
- file: Contributors_Guide.rst - file: Contributors_Guide.rst
title: Contributing to CK title: Contributing to CK
- file: license.md - file: license.rst
title: License title: License
\ No newline at end of file
rocm-docs-core==0.34.0 rocm-docs-core==0.36.0
sphinxcontrib-bibtex==2.6.2 sphinxcontrib-bibtex==2.6.2
...@@ -113,7 +113,7 @@ requests==2.31.0 ...@@ -113,7 +113,7 @@ requests==2.31.0
# via # via
# pygithub # pygithub
# sphinx # sphinx
rocm-docs-core==0.34.0 rocm-docs-core==0.36.0
# via -r requirements.in # via -r requirements.in
six==1.16.0 six==1.16.0
# via # via
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment