"..._static/git@developer.sourcefind.cn:OpenDAS/mmcv.git" did not exist on "c57b8b184bcd21723df284767f8262839d9c60d6"
Commit c8d9660f authored by coderfeli's avatar coderfeli
Browse files

using develop branch timer

parent 031ddf35
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
#pragma once #pragma once
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#include <hip/hip_ext.h>
#include <set> #include <set>
#include <vector> #include <vector>
...@@ -43,8 +42,8 @@ struct RotatingMemWrapperMultiD ...@@ -43,8 +42,8 @@ struct RotatingMemWrapperMultiD
{ {
{ {
void* pADeviceBuf; void* pADeviceBuf;
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_)); hip_check_error(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf), hip_check_error(hipMemcpy(static_cast<void*>(pADeviceBuf),
const_cast<void*>(p_a_grids[0]), const_cast<void*>(p_a_grids[0]),
size_a_, size_a_,
hipMemcpyDeviceToDevice)); hipMemcpyDeviceToDevice));
...@@ -53,8 +52,8 @@ struct RotatingMemWrapperMultiD ...@@ -53,8 +52,8 @@ struct RotatingMemWrapperMultiD
{ {
void* pBDeviceBuf; void* pBDeviceBuf;
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_)); hip_check_error(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf), hip_check_error(hipMemcpy(static_cast<void*>(pBDeviceBuf),
const_cast<void*>(p_b_grids[0]), const_cast<void*>(p_b_grids[0]),
size_b_, size_b_,
hipMemcpyDeviceToDevice)); hipMemcpyDeviceToDevice));
...@@ -66,8 +65,8 @@ struct RotatingMemWrapperMultiD ...@@ -66,8 +65,8 @@ struct RotatingMemWrapperMultiD
DsGridPointer ds_buffer; DsGridPointer ds_buffer;
static_for<0, NumDs, 1>{}([&](auto j) { static_for<0, NumDs, 1>{}([&](auto j) {
void* pDDeviceBuf; void* pDDeviceBuf;
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pDDeviceBuf), size_ds_[j])); hip_check_error(hipMalloc(static_cast<void**>(&pDDeviceBuf), size_ds_[j]));
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pDDeviceBuf), hip_check_error(hipMemcpy(static_cast<void*>(pDDeviceBuf),
static_cast<const void*>(p_ds_grids[0][j]), static_cast<const void*>(p_ds_grids[0][j]),
size_ds_[j], size_ds_[j],
hipMemcpyDeviceToDevice)); hipMemcpyDeviceToDevice));
...@@ -94,10 +93,8 @@ struct RotatingMemWrapperMultiD ...@@ -94,10 +93,8 @@ struct RotatingMemWrapperMultiD
} }
void Print() void Print()
{ {
std::cout << "RotatingMemWrapperMultiD: { size_a: " << size_a << ", size_b: " << size_b; std::cout << "RotatingMemWrapperMultiD: { size_a: " << size_a << ", size_b: " << size_b
static_for<0, NumDs, 1>{}( << ", rotating_count: " << rotating_count << "}" << std::endl;
[&](auto j) { std::cout << ", size_d" << j.value << ": " << size_ds[j]; });
std::cout << ", rotating_count: " << rotating_count << "}" << std::endl;
} }
~RotatingMemWrapperMultiD() ~RotatingMemWrapperMultiD()
{ {
...@@ -111,35 +108,13 @@ struct RotatingMemWrapperMultiD ...@@ -111,35 +108,13 @@ struct RotatingMemWrapperMultiD
// free device mem // free device mem
for(size_t i = 1; i < rotating_count; i++) for(size_t i = 1; i < rotating_count; i++)
{ {
try hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
{ hip_check_error(hipFree(const_cast<void*>(p_b_grids[i])));
HIP_CHECK_ERROR(hipFree(const_cast<void*>(p_a_grids[i])));
}
catch(std::runtime_error& re)
{
std::cerr << re.what() << std::endl;
}
try
{
HIP_CHECK_ERROR(hipFree(const_cast<void*>(p_b_grids[i])));
}
catch(std::runtime_error& re)
{
std::cerr << re.what() << std::endl;
}
static_for<0, NumDs, 1>{}([&](auto j) { static_for<0, NumDs, 1>{}([&](auto j) {
using DDataType = remove_cvref_t<tuple_element_t<j.value, DsDataType>>; using DDataType = remove_cvref_t<tuple_element_t<j.value, DsDataType>>;
try hip_check_error(
{ hipFree(static_cast<void*>(const_cast<DDataType*>(p_ds_grids[i][j]))));
HIP_CHECK_ERROR(
hipFree(static_cast<void*>(const_cast<DDataType*>(p_ds_grids[i][j]))));
}
catch(std::runtime_error& re)
{
std::cerr << re.what() << std::endl;
}
}); });
} }
} }
...@@ -176,8 +151,8 @@ struct RotatingMemWrapper ...@@ -176,8 +151,8 @@ struct RotatingMemWrapper
{ {
{ {
void* pADeviceBuf; void* pADeviceBuf;
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_)); hip_check_error(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf), hip_check_error(hipMemcpy(static_cast<void*>(pADeviceBuf),
const_cast<void*>(p_a_grids[0]), const_cast<void*>(p_a_grids[0]),
size_a_, size_a_,
hipMemcpyDeviceToDevice)); hipMemcpyDeviceToDevice));
...@@ -186,8 +161,8 @@ struct RotatingMemWrapper ...@@ -186,8 +161,8 @@ struct RotatingMemWrapper
{ {
void* pBDeviceBuf; void* pBDeviceBuf;
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_)); hip_check_error(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf), hip_check_error(hipMemcpy(static_cast<void*>(pBDeviceBuf),
const_cast<void*>(p_b_grids[0]), const_cast<void*>(p_b_grids[0]),
size_b_, size_b_,
hipMemcpyDeviceToDevice)); hipMemcpyDeviceToDevice));
...@@ -221,23 +196,8 @@ struct RotatingMemWrapper ...@@ -221,23 +196,8 @@ struct RotatingMemWrapper
// free device mem // free device mem
for(size_t i = 1; i < rotating_count; i++) for(size_t i = 1; i < rotating_count; i++)
{ {
try hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
{ hip_check_error(hipFree(const_cast<void*>(p_b_grids[i])));
HIP_CHECK_ERROR(hipFree(const_cast<void*>(p_a_grids[i])));
}
catch(std::runtime_error& re)
{
std::cerr << re.what() << std::endl;
}
try
{
HIP_CHECK_ERROR(hipFree(const_cast<void*>(p_b_grids[i])));
}
catch(std::runtime_error& re)
{
std::cerr << re.what() << std::endl;
}
} }
} }
} }
...@@ -255,20 +215,25 @@ struct RotatingMemWrapper ...@@ -255,20 +215,25 @@ struct RotatingMemWrapper
inline void flush_icache() inline void flush_icache()
{ {
hipDeviceProp_t deviceProps; hipDeviceProp_t deviceProps;
HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0)); hip_check_error(hipGetDeviceProperties(&deviceProps, 0));
int32_t gpu_block3 = deviceProps.multiProcessorCount * 60; int32_t gpu_block3 = deviceProps.multiProcessorCount * 60;
ck::flush_icache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>(); ck::flush_icache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
HIP_CHECK_ERROR(hipGetLastError()); hip_check_error(hipGetLastError());
} }
// if TimePrePress == false, return time does not include preprocess's time // if TimePrePress == false, return time does not include preprocess's time
template <bool TimePreprocess, typename... Args, typename F, typename PreProcessFunc> template <bool TimePreprocess,
typename GemmArgs,
typename... Args,
typename F,
typename PreProcessFunc>
float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
PreProcessFunc preprocess, PreProcessFunc preprocess,
F kernel, F kernel,
dim3 grid_dim, dim3 grid_dim,
dim3 block_dim, dim3 block_dim,
std::size_t lds_byte, std::size_t lds_byte,
GemmArgs& gemm_args,
Args... args) Args... args)
{ {
#if CK_TIME_KERNEL #if CK_TIME_KERNEL
...@@ -291,8 +256,8 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, ...@@ -291,8 +256,8 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
// warm up // warm up
for(int i = 0; i < stream_config.cold_niters_; ++i) for(int i = 0; i < stream_config.cold_niters_; ++i)
{ {
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...); kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(gemm_args, args...);
HIP_CHECK_ERROR(hipGetLastError()); hip_check_error(hipGetLastError());
} }
const int nrepeat = stream_config.nrepeat_; const int nrepeat = stream_config.nrepeat_;
...@@ -312,36 +277,54 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, ...@@ -312,36 +277,54 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
#endif #endif
hipEvent_t start, stop; hipEvent_t start, stop;
HIP_CHECK_ERROR(hipEventCreate(&start)); hip_check_error(hipEventCreate(&start));
HIP_CHECK_ERROR(hipEventCreate(&stop)); hip_check_error(hipEventCreate(&stop));
hip_check_error(hipDeviceSynchronize());
hip_check_error(hipEventRecord(start, stream_config.stream_id_));
for(int i = 0; i < nrepeat; ++i) for(int i = 0; i < nrepeat; ++i)
{ {
preprocess(); if constexpr(!TimePreprocess)
{
preprocess();
}
// hipEvent_t start, stop;
// hip_check_error(hipEventCreate(&start));
// hip_check_error(hipEventCreate(&stop));
// hip_check_error(hipDeviceSynchronize());
// hip_check_error(hipEventRecord(start, stream_config.stream_id_));
// calculate preprocess time
if constexpr(TimePreprocess)
{
preprocess();
}
// run real kernel // run real kernel
hipExtLaunchKernelGGL(kernel, kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(gemm_args, args...);
grid_dim, hip_check_error(hipGetLastError());
block_dim,
lds_byte,
stream_config.stream_id_,
start,
stop,
0,
args...);
HIP_CHECK_ERROR(hipGetLastError());
// end real kernel // end real kernel
HIP_CHECK_ERROR(hipEventRecord(stop, stream_config.stream_id_)); // hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
HIP_CHECK_ERROR(hipEventSynchronize(stop)); // hip_check_error(hipEventSynchronize(stop));
// float cur_time = 0;
// hip_check_error(hipEventElapsedTime(&cur_time, start, stop));
// #if MEDIAN
// times.insert(cur_time);
// #else
// total_time += cur_time;
// #endif
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
// std::cout << "i: " << i << " cur_time: " << cur_time << std::endl;
float cur_time = 0; printf("gemm_args.p_a_grid: %p, gemm_args.p_b_grid:%p\n",
HIP_CHECK_ERROR(hipEventElapsedTime(&cur_time, start, stop)); static_cast<const void*>(gemm_args.p_a_grid),
#if MEDIAN static_cast<const void*>(gemm_args.p_b_grid));
times.insert(cur_time); }
#else
total_time += cur_time;
#endif
} }
hip_check_error(hipEventRecord(stop, stream_config.stream_id_)); hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
hip_check_error(hipEventSynchronize(stop)); hip_check_error(hipEventSynchronize(stop));
...@@ -367,20 +350,24 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, ...@@ -367,20 +350,24 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
return (*mid + *mid_next) / 2; return (*mid + *mid_next) / 2;
} }
#else #else
return total_time / nrepeat; // return total_time / nrepeat;
hipDeviceProp_t deviceProps;
hip_check_error(hipGetDeviceProperties(&deviceProps, 0));
float preprocess_offset = deviceProps.multiProcessorCount == 80 ? 0.005 : 0.01;
return (total_time - preprocess_offset * nrepeat) / nrepeat;
#endif #endif
} }
else else
{ {
preprocess(); preprocess();
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...); kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(gemm_args, args...);
HIP_CHECK_ERROR(hipGetLastError()); hip_check_error(hipGetLastError());
return 0; return 0;
} }
#else #else
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...); kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(gemm_args, args...);
HIP_CHECK_ERROR(hipGetLastError()); hip_check_error(hipGetLastError());
return 0; return 0;
#endif #endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment