Commit 0b9fe840 authored by carlushuang's avatar carlushuang
Browse files

add aligned memory type, wall timer

parent 4bdeeb33
......@@ -23,6 +23,20 @@ struct DeviceMem
std::size_t mMemSize;
};
struct DeviceAlignedMemCPU
{
DeviceAlignedMemCPU() = delete;
DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment);
void* GetDeviceBuffer();
std::size_t GetBufferSize();
void SetZero();
~DeviceAlignedMemCPU();
void* mpDeviceBuf;
std::size_t mMemSize;
std::size_t mAlignment;
};
struct KernelTimerImpl;
struct KernelTimer
......@@ -36,6 +50,19 @@ struct KernelTimer
std::unique_ptr<KernelTimerImpl> impl;
};
struct WallTimerImpl;
struct WallTimer
{
WallTimer();
~WallTimer();
void Start();
void End();
float GetElapsedTime() const;
std::unique_ptr<WallTimerImpl> impl;
};
using device_stream_t = hipStream_t;
template <typename... Args, typename F>
......
#include <chrono>
#include "device.hpp"
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
......@@ -24,6 +25,30 @@ void DeviceMem::SetZero() { hipGetErrorString(hipMemset(mpDeviceBuf, 0, mMemSize
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment)
: mMemSize(mem_size), mAlignment(alignment)
{
assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2
void* p1;
void** p2;
int offset = alignment - 1 + sizeof(void*);
p1 = malloc(mem_size + offset);
assert(p1 != nullptr);
p2 = reinterpret_cast<void**>((reinterpret_cast<size_t>(p1) + offset) & ~(alignment - 1));
p2[-1] = p1;
mpDeviceBuf = reinterpret_cast<void*>(p2);
}
void* DeviceAlignedMemCPU::GetDeviceBuffer() { return mpDeviceBuf; }
std::size_t DeviceAlignedMemCPU::GetBufferSize() { return mMemSize; }
void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }
DeviceAlignedMemCPU::~DeviceAlignedMemCPU() { free((reinterpret_cast<void**>(mpDeviceBuf))[-1]); }
struct KernelTimerImpl
{
KernelTimerImpl()
......@@ -69,3 +94,30 @@ void KernelTimer::Start() { impl->Start(); }
void KernelTimer::End() { impl->End(); }
float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }
struct WallTimerImpl
{
void Start() { mStart = std::chrono::high_resolution_clock::now(); }
void End() { mStop = std::chrono::high_resolution_clock::now(); }
float GetElapsedTime() const
{
return static_cast<float>(
std::chrono::duration_cast<std::chrono::microseconds>(mStop - mStart).count()) *
1e-3;
}
std::chrono::time_point<std::chrono::high_resolution_clock> mStart;
std::chrono::time_point<std::chrono::high_resolution_clock> mStop;
};
WallTimer::WallTimer() : impl(new WallTimerImpl()) {}
WallTimer::~WallTimer() {}
void WallTimer::Start() { impl->Start(); }
void WallTimer::End() { impl->End(); }
float WallTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }
add_test_executable(test_cpu_gemm_uk cpu_gemm_uk.cpp)
target_link_libraries(test_cpu_gemm_uk PRIVATE host_tensor)
......@@ -6,8 +6,9 @@
#include <sstream>
#include <tuple>
#include <memory>
#include <chrono>
#include <half.hpp>
#include "host_tensor.hpp"
#include "device.hpp"
#include "config.hpp"
#include "print.hpp"
#include "cpuid.hpp"
......@@ -128,24 +129,6 @@ void dump_cache_hierarchy()
}
}
void* __aligned_malloc(size_t required_bytes, size_t alignment)
{
if(alignment == 0 || (alignment & (alignment - 1))) // check pow of 2
return nullptr;
void* p1; // original block
void** p2; // aligned block
int offset = alignment - 1 + sizeof(void*);
if((p1 = malloc(required_bytes + offset)) == nullptr)
{
return nullptr;
}
p2 = reinterpret_cast<void**>((reinterpret_cast<size_t>(p1) + offset) & ~(alignment - 1));
p2[-1] = p1;
return p2;
}
void __aligned_free(void* p) { free((reinterpret_cast<void**>(p))[-1]); }
template <typename T>
void rand_vector(T* v, int elem)
{
......@@ -186,30 +169,35 @@ template <typename FloatA, typename FloatB, typename ALayout, typename BLayout>
void ref_cpu_gemm_uk(
const FloatA* a, const FloatB* b, float* c, float alpha, uint32_t m, uint32_t n, uint32_t k)
{
auto a_offset = [&](uint32_t im, uint32_t ik) {
if constexpr(std::is_same<Row, ALayout>::value)
auto f_host_2d_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), Row>::value)
{
return im * k + ik;
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
}
else
{
return ik * m + im;
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
}
};
auto b_offset = [&](uint32_t ik, uint32_t in) {
if constexpr(std::is_same<Row, BLayout>::value)
{
return ik * n + in;
}
else
{
// n*k*n8
return (in / 8) * k * 8 + ik * 8 + in % 8;
}
auto f_host_vectored_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t vec, std::size_t stride) {
// only valid in row major. stride is for each row, contains vector size
return HostTensorDescriptor(std::vector<std::size_t>({row, col, vec}),
std::vector<std::size_t>({stride, vec, 1}));
};
auto c_offset = [&](uint32_t im, uint32_t in) { return im * n + in; };
std::size_t lda = std::is_same<Row, ALayout>::value ? k : m; // in unit of element
std::size_t ldb = std::is_same<Row, BLayout>::value ? n : k * 8; // in unit of element
std::size_t ldc = n;
HostTensorDescriptor a_m_k = f_host_2d_tensor_descriptor(m, n, lda, ALayout{});
HostTensorDescriptor b_k_n = std::is_same<Row, BLayout>::value
? f_host_2d_tensor_descriptor(k, n, ldb, BLayout{})
: f_host_vectored_tensor_descriptor(n / 8, k, 8, ldb);
HostTensorDescriptor c_m_n = f_host_2d_tensor_descriptor(m, n, ldc, Row{});
for(uint32_t im = 0; im < m; im++)
{
......@@ -218,11 +206,14 @@ void ref_cpu_gemm_uk(
float acc = .0f;
for(uint32_t ik = 0; ik < k; ik++)
{
acc += static_cast<float>(a[a_offset(im, ik)]) *
static_cast<float>(b[b_offset(ik, in)]);
acc += static_cast<float>(a[a_m_k.GetOffsetFromMultiIndex(im, ik)]) *
(std::is_same<Row, BLayout>::value
? static_cast<float>(b[b_k_n.GetOffsetFromMultiIndex(ik, in)])
: static_cast<float>(
b[b_k_n.GetOffsetFromMultiIndex(in / 8, ik, in % 8)]));
}
acc *= alpha;
c[c_offset(im, in)] = acc;
c[c_m_n.GetOffsetFromMultiIndex(im, in)] = acc;
}
}
}
......@@ -326,17 +317,17 @@ void test_ukernel(ukenrel_t uk,
invoke_uk();
}
auto t0 = std::chrono::high_resolution_clock::now();
WallTimer timer;
timer.Start();
for(int i = 0; i < repeat; i++)
{
invoke_uk();
}
auto t1 = std::chrono::high_resolution_clock::now();
timer.End();
double us = static_cast<double>(
std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count()) /
repeat;
double gflops = static_cast<double>(2 * m * n * k) * 1e-3 / us;
float us = timer.GetElapsedTime() * 1e3 / repeat;
float gflops = static_cast<float>(2 * m * n * k) * 1e-3 / us;
memset(mat_c, 0, m * n * sizeof(float));
invoke_uk();
......@@ -349,20 +340,27 @@ void test_ukernel(ukenrel_t uk,
template <typename FloatA, typename FloatB, typename ALayout, typename BLayout>
void test_cpu_ukernel(float alpha, uint32_t m, uint32_t n, uint32_t k)
{
FloatA* mat_a = reinterpret_cast<FloatA*>(__aligned_malloc(m * k * sizeof(FloatA), 32));
FloatB* mat_b = reinterpret_cast<FloatB*>(__aligned_malloc(k * n * sizeof(FloatB), 32));
float* mat_c = reinterpret_cast<float*>(__aligned_malloc(m * n * sizeof(float), 32));
float* mat_c_ref = reinterpret_cast<float*>(__aligned_malloc(m * n * sizeof(float), 32));
memset(mat_c_ref, 0, m * n * sizeof(float));
rand_vector(mat_a, m * k);
rand_vector(mat_b, k * n);
ref_cpu_gemm_uk<FloatA, FloatB, ALayout, BLayout>(mat_a, mat_b, mat_c_ref, alpha, m, n, k);
using thread_gemm_instance = thread_gemm_avx2_mxn_6x16_instances<ALayout, BLayout>;
// using thread_gemm_instance = thread_gemm_avx2_mxn_4x24_instances<ALayout, BLayout>;
DeviceAlignedMemCPU a_mem(m * k * sizeof(FloatA), 32);
DeviceAlignedMemCPU b_mem(k * n * sizeof(FloatB), 32);
DeviceAlignedMemCPU c_mem(m * n * sizeof(float), 32);
DeviceAlignedMemCPU c_mem_ref(m * n * sizeof(float), 32);
c_mem_ref.SetZero();
rand_vector(reinterpret_cast<FloatA*>(a_mem.mpDeviceBuf), m * k);
rand_vector(reinterpret_cast<FloatB*>(b_mem.mpDeviceBuf), k * n);
ref_cpu_gemm_uk<FloatA, FloatB, ALayout, BLayout>(
reinterpret_cast<FloatA*>(a_mem.mpDeviceBuf),
reinterpret_cast<FloatB*>(b_mem.mpDeviceBuf),
reinterpret_cast<float*>(c_mem_ref.mpDeviceBuf),
alpha,
m,
n,
k);
// using thread_gemm_instance = thread_gemm_avx2_mxn_6x16_instances<ALayout, BLayout>;
using thread_gemm_instance = thread_gemm_avx2_mxn_4x24_instances<ALayout, BLayout>;
bool found = false;
ck::static_for<0, std::tuple_size_v<thread_gemm_instance>, 1>{}([&](auto i) {
......@@ -377,24 +375,27 @@ void test_cpu_ukernel(float alpha, uint32_t m, uint32_t n, uint32_t k)
if(found)
return;
test_ukernel<FloatA, FloatB, ALayout, BLayout>(
uk_type{}, mat_a, mat_b, mat_c, alpha, m, n, k);
bool is_valid = valid_vector(mat_c_ref, mat_c, m * n);
test_ukernel<FloatA, FloatB, ALayout, BLayout>(uk_type{},
reinterpret_cast<FloatA*>(a_mem.mpDeviceBuf),
reinterpret_cast<FloatB*>(b_mem.mpDeviceBuf),
reinterpret_cast<float*>(c_mem.mpDeviceBuf),
alpha,
m,
n,
k);
bool is_valid = valid_vector(reinterpret_cast<float*>(c_mem_ref.mpDeviceBuf),
reinterpret_cast<float*>(c_mem.mpDeviceBuf),
m * n);
printf("vald:%s\n", is_valid ? "y" : "n");
found = true;
});
__aligned_free(mat_a);
__aligned_free(mat_b);
__aligned_free(mat_c);
__aligned_free(mat_c_ref);
}
int main(int argc, char** argv)
{
int m = 6;
int n = 16;
int m = 4;
int n = 24;
int k = 64;
float alpha = 1.0f;
if(argc > 3)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment