Commit a6e310af authored by carlushuang's avatar carlushuang
Browse files

support omp and multi-core

parent 7e7640ce
add_test_executable(test_cpu_gemm_uk cpu_gemm_uk.cpp) add_test_executable(test_cpu_gemm_uk cpu_gemm_uk.cpp)
target_link_libraries(test_cpu_gemm_uk PRIVATE host_tensor) target_link_libraries(test_cpu_gemm_uk PRIVATE host_tensor)
# 3.13 introduce target_link_directories, which is better
set_target_properties(test_cpu_gemm_uk PROPERTIES LINK_FLAGS -Wl,-rpath,/opt/rocm/llvm/lib )
target_link_libraries(test_cpu_gemm_uk PRIVATE /opt/rocm/llvm/lib/libomp.so)
target_compile_options(test_cpu_gemm_uk PRIVATE -fopenmp=libomp -Wno-unused-command-line-argument)
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <tuple> #include <tuple>
#include <memory> #include <memory>
#include <half.hpp> #include <half.hpp>
#include <omp.h>
#include "host_tensor.hpp" #include "host_tensor.hpp"
#include "device.hpp" #include "device.hpp"
#include "config.hpp" #include "config.hpp"
...@@ -228,22 +229,14 @@ void test_ukernel(ukenrel_t uk, ...@@ -228,22 +229,14 @@ void test_ukernel(ukenrel_t uk,
uint32_t n, uint32_t n,
uint32_t k) uint32_t k)
{ {
ck::cpu::ThreadwiseGemmParam param; int max_threads = omp_get_max_threads();
param.p_a = mat_a;
param.p_b = mat_b; auto invoke_uk = [&](ck::cpu::ThreadwiseGemmParam& param, float* current_mat_c) {
param.p_c = mat_c;
param.Kr = k;
param.lda = (std::is_same<Row, ALayout>::value ? k : m) * sizeof(FloatA);
param.ldb = (std::is_same<Row, BLayout>::value ? n : k * 8) * sizeof(FloatB);
param.ldc = n * sizeof(float);
param.alpha = alpha;
auto invoke_uk = [&]() {
if constexpr(std::is_same<Row, ALayout>::value && std::is_same<Row, BLayout>::value) if constexpr(std::is_same<Row, ALayout>::value && std::is_same<Row, BLayout>::value)
{ {
assert(m % uk.Mr_ == 0 && n == uk.Nr_); assert(m % uk.Mr_ == 0 && n == uk.Nr_);
FloatA* p_a = mat_a; FloatA* p_a = mat_a;
float* p_c = mat_c; float* p_c = current_mat_c;
param.p_a = p_a; param.p_a = p_a;
param.p_c = p_c; param.p_c = p_c;
for(uint32_t i_m = 0; i_m < m; i_m += uk.Mr_) for(uint32_t i_m = 0; i_m < m; i_m += uk.Mr_)
...@@ -259,7 +252,7 @@ void test_ukernel(ukenrel_t uk, ...@@ -259,7 +252,7 @@ void test_ukernel(ukenrel_t uk,
{ {
assert(m % uk.Mr_ == 0 && n % uk.Nr_ == 0); assert(m % uk.Mr_ == 0 && n % uk.Nr_ == 0);
FloatA* p_a = mat_a; FloatA* p_a = mat_a;
float* p_c = mat_c; float* p_c = current_mat_c;
param.p_a = p_a; param.p_a = p_a;
param.p_b = mat_b; param.p_b = mat_b;
param.p_c = p_c; param.p_c = p_c;
...@@ -291,7 +284,7 @@ void test_ukernel(ukenrel_t uk, ...@@ -291,7 +284,7 @@ void test_ukernel(ukenrel_t uk,
{ {
assert(m % uk.Mr_ == 0 && n % uk.Nr_ == 0); assert(m % uk.Mr_ == 0 && n % uk.Nr_ == 0);
FloatB* p_b = mat_b; FloatB* p_b = mat_b;
float* p_c = mat_c; float* p_c = current_mat_c;
param.p_b = p_b; param.p_b = p_b;
param.p_c = p_c; param.p_c = p_c;
for(uint32_t i_n = 0; i_n < n; i_n += uk.Nr_) for(uint32_t i_n = 0; i_n < n; i_n += uk.Nr_)
...@@ -308,29 +301,54 @@ void test_ukernel(ukenrel_t uk, ...@@ -308,29 +301,54 @@ void test_ukernel(ukenrel_t uk,
printf("gemm_uk_%dx%d_%c%c: ", uk.Mr_, uk.Nr_, ALayout::name[0], BLayout::name[0]); printf("gemm_uk_%dx%d_%c%c: ", uk.Mr_, uk.Nr_, ALayout::name[0], BLayout::name[0]);
fflush(stdout); fflush(stdout);
// printf("%s: ", typeid(uk).name());fflush(stdout); // printf("%s: ", typeid(uk).name());fflush(stdout);
memset(mat_c, 0, m * n * sizeof(float));
int repeat = 7e10 / (2 * m * n * k); float us = .0f;
for(int i = 0; i < (repeat / 5); i++) #pragma omp parallel reduction(+ : us)
{ {
invoke_uk(); int tid = omp_get_thread_num();
} float* private_c = reinterpret_cast<float*>(malloc(m * n * sizeof(float)));
WallTimer timer; ck::cpu::ThreadwiseGemmParam param;
param.p_a = mat_a;
param.p_b = mat_b;
param.p_c = private_c;
param.Kr = k;
param.lda = (std::is_same<Row, ALayout>::value ? k : m) * sizeof(FloatA);
param.ldb = (std::is_same<Row, BLayout>::value ? n : k * 8) * sizeof(FloatB);
param.ldc = n * sizeof(float);
param.alpha = alpha;
timer.Start(); memset(private_c, 0, m * n * sizeof(float));
for(int i = 0; i < repeat; i++)
{ int repeat = 7e10 / (2 * m * n * k);
invoke_uk();
for(int i = 0; i < (repeat / 5); i++)
{
invoke_uk(param, private_c);
}
WallTimer timer;
timer.Start();
for(int i = 0; i < repeat; i++)
{
invoke_uk(param, private_c);
}
timer.End();
us += timer.GetElapsedTime() * 1e3 / repeat;
memset(private_c, 0, m * n * sizeof(float));
invoke_uk(param, private_c);
memcpy(mat_c + tid * m * n, private_c, m * n * sizeof(float));
free(private_c);
} }
timer.End();
float us = timer.GetElapsedTime() * 1e3 / repeat; us = us / max_threads;
float gflops = static_cast<float>(2 * m * n * k) * 1e-3 / us;
memset(mat_c, 0, m * n * sizeof(float)); float gflops = static_cast<float>(2 * m * n * k * max_threads) * 1e-3 / us;
invoke_uk();
printf("m:%u, n:%u, k:%u, alpha:%f, cost:%lfus, GFLOPS:%lf, ", m, n, k, alpha, us, gflops); printf("m:%u, n:%u, k:%u, alpha:%f, cost:%lfus, GFLOPS:%lf, ", m, n, k, alpha, us, gflops);
fflush(stdout); fflush(stdout);
...@@ -340,10 +358,11 @@ void test_ukernel(ukenrel_t uk, ...@@ -340,10 +358,11 @@ void test_ukernel(ukenrel_t uk,
template <typename FloatA, typename FloatB, typename ALayout, typename BLayout> template <typename FloatA, typename FloatB, typename ALayout, typename BLayout>
void test_cpu_ukernel(float alpha, uint32_t m, uint32_t n, uint32_t k) void test_cpu_ukernel(float alpha, uint32_t m, uint32_t n, uint32_t k)
{ {
int max_threads = omp_get_max_threads();
DeviceAlignedMemCPU a_mem(m * k * sizeof(FloatA), 32); DeviceAlignedMemCPU a_mem(m * k * sizeof(FloatA), 32);
DeviceAlignedMemCPU b_mem(k * n * sizeof(FloatB), 32); DeviceAlignedMemCPU b_mem(k * n * sizeof(FloatB), 32);
DeviceAlignedMemCPU c_mem(m * n * sizeof(float), 32); DeviceAlignedMemCPU c_mem(m * n * sizeof(float) * max_threads, 32);
DeviceAlignedMemCPU c_mem_ref(m * n * sizeof(float), 32); DeviceAlignedMemCPU c_mem_ref(m * n * sizeof(float), 32);
c_mem_ref.SetZero(); c_mem_ref.SetZero();
...@@ -409,6 +428,10 @@ int main(int argc, char** argv) ...@@ -409,6 +428,10 @@ int main(int argc, char** argv)
alpha = std::atof(argv[4]); alpha = std::atof(argv[4]);
} }
dump_cache_hierarchy(); dump_cache_hierarchy();
if(std::getenv("OMP_NUM_THREADS") == nullptr)
omp_set_num_threads(1);
printf("max threads:%d\n", omp_get_max_threads());
test_cpu_ukernel<AType, BType, Row, Row>(alpha, m, n, k); test_cpu_ukernel<AType, BType, Row, Row>(alpha, m, n, k);
test_cpu_ukernel<AType, BType, Row, Col>(alpha, m, n, k); test_cpu_ukernel<AType, BType, Row, Col>(alpha, m, n, k);
test_cpu_ukernel<AType, BType, Col, Row>(alpha, m, n, k); test_cpu_ukernel<AType, BType, Col, Row>(alpha, m, n, k);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment