Commit 1e95a6e2 authored by carlushuang
Browse files

update script

parent 50ba9c44
......@@ -17,6 +17,10 @@
#define TEST_ELEMENTWISE_VERBOSE 1
#endif
#ifndef TEST_ELEMENTWISE_HIPGRAPH
#define TEST_ELEMENTWISE_HIPGRAPH 1
#endif
template <typename T>
void dump_host_tensor_2d(const ck_tile::HostTensor<T>& x)
{
......@@ -149,6 +153,13 @@ bool test_cast(ck_tile::ArgParser args)
t_.input_type = input_prec;
t_.output_type = output_prec;
t_.op = std::string("cast");
t_.num_cu = [&]() {
    // Ask the HIP runtime which device is active, then read that device's
    // multiprocessor count; the dispatch code uses it to size its thresholds.
    hipDevice_t active_device;
    HIP_CHECK_ERROR(hipGetDevice(&active_device));

    hipDeviceProp_t device_props;
    HIP_CHECK_ERROR(hipGetDeviceProperties(&device_props, active_device));

    return device_props.multiProcessorCount;
}();
return t_;
}();
......@@ -161,11 +172,65 @@ bool test_cast(ck_tile::ArgParser args)
}();
#if TEST_ELEMENTWISE_VERBOSE
ck_tile::stream_config sc{nullptr, true};
#if !TEST_ELEMENTWISE_HIPGRAPH
ck_tile::stream_config sc{nullptr, true, 0, 20, 50, false};
// ck_tile::stream_config sc{nullptr};
auto ms = elementwise(trait, karg, sc);
#else
// Average per-launch time in milliseconds, measured by replaying a captured
// HIP graph. A negative value means the requested cast is not supported,
// matching the convention of the non-graph path (see the `ms < 0` check below).
float ms = 0;
{
    const int repeat = 50; // number of kernel launches recorded into the graph
    const int warmup = 20; // un-timed launches issued before the measurement

    hipGraph_t graph_;
    hipStream_t stream_;
    HIP_CHECK_ERROR(hipStreamCreate(&stream_));
    ck_tile::stream_config sc{stream_};

    // Record `repeat` back-to-back launches into a graph instead of executing them.
    // Keep any negative return from elementwise() so an unsupported configuration
    // is still reported, rather than timing an empty graph.
    float launch_status = 0;
    HIP_CHECK_ERROR(hipStreamBeginCapture(sc.stream_id_, hipStreamCaptureModeGlobal));
    for(int i_r = 0; i_r < repeat; i_r++)
    {
        const float status = elementwise(trait, karg, sc);
        if(status < 0)
            launch_status = status;
    }
    HIP_CHECK_ERROR(hipStreamEndCapture(sc.stream_id_, &graph_));

    hipGraphExec_t instance_;
    HIP_CHECK_ERROR(hipGraphInstantiate(&instance_, graph_, nullptr, nullptr, 0));

    hipEvent_t start_, stop_;
    HIP_CHECK_ERROR(hipEventCreate(&start_));
    HIP_CHECK_ERROR(hipEventCreate(&stop_));

    // warm-up: direct (non-graph) launches on the same stream
    for(int i_r = 0; i_r < warmup; i_r++)
    {
        elementwise(trait, karg, sc);
    }
    HIP_CHECK_ERROR(hipDeviceSynchronize());

    // Time one replay of the whole graph (= `repeat` kernel launches).
    HIP_CHECK_ERROR(hipEventRecord(start_, sc.stream_id_));
    HIP_CHECK_ERROR(hipGraphLaunch(instance_, sc.stream_id_));
    HIP_CHECK_ERROR(hipEventRecord(stop_, sc.stream_id_));
    HIP_CHECK_ERROR(hipEventSynchronize(stop_));
    HIP_CHECK_ERROR(hipGetLastError());

    float total_time = 0;
    HIP_CHECK_ERROR(hipEventElapsedTime(&total_time, start_, stop_));

    // Release every HIP object created in this scope; the events are destroyed
    // only after the elapsed time has been read from them.
    HIP_CHECK_ERROR(hipGraphExecDestroy(instance_));
    HIP_CHECK_ERROR(hipGraphDestroy(graph_));
    HIP_CHECK_ERROR(hipEventDestroy(start_));
    HIP_CHECK_ERROR(hipEventDestroy(stop_));
    HIP_CHECK_ERROR(hipStreamDestroy(stream_));

    ms = (launch_status < 0) ? launch_status : total_time / repeat;
}
#endif
auto gbps = [&]() {
    // Traffic model: every element is read once as SrcType and written once as DstType.
    const auto bytes_read    = num_pixels * sizeof(SrcType);
    const auto bytes_written = num_pixels * sizeof(DstType);
    const double bytes_moved = bytes_read + bytes_written;

    // bytes / 1e6 / milliseconds == gigabytes per second
    return bytes_moved / 1.E6 / ms;
}();
printf(
"[cast] %s->%s, n:%lu, ms:%f, ", input_prec.c_str(), output_prec.c_str(), num_pixels, ms);
"[cast] %s->%s, n:%lu, ns:%f(ms:%f), %.2fGB/s, ", input_prec.c_str(), output_prec.c_str(), num_pixels, ms*1e6, ms, gbps);
if(ms < 0)
printf("not supported\n");
fflush(stdout);
......
......@@ -11,12 +11,12 @@ struct Cast
};
} // namespace impl
#define DISPATCH_ELEMENTWISE_CAST(d_type_, s_type_, byte_per_issue_, chunks_) \
#define DISPATCH_E_CAST_(d_type_, s_type_, byte_per_issue_, chunks_, bs_) \
using src_t = s_type_; \
using dst_t = d_type_; \
using u_fun = typename impl::Cast; \
using problem = \
ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_>; \
ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
using pipeline = ck_tile::ElementwiseUnaryipeline<problem>; \
using kernel = ck_tile::ElementwiseUnaryKernel<pipeline>; \
\
......@@ -25,7 +25,7 @@ struct Cast
constexpr dim3 blocks = kernel::BlockSize(); \
\
float ave_time = ck_tile::launch_kernel( \
s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \
s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \
return ave_time;
float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s)
......@@ -35,11 +35,51 @@ float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_conf
{
if(t.output_type == "fp32" && t.input_type == "fp16")
{
DISPATCH_ELEMENTWISE_CAST(float, ck_tile::fp16_t, 8*sizeof(ck_tile::fp16_t), 8)
constexpr int eb = sizeof(ck_tile::fp16_t);
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 64)
}
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 128)
}
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 256)
}
else if (a.num_pixels % 4 == 0) {
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 1, 256)
}
else {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 8, 256)
}
}
else {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256)
}
}
else if(t.output_type == "fp16" && t.input_type == "fp32")
{
DISPATCH_ELEMENTWISE_CAST(ck_tile::fp16_t, float, 4*sizeof(float), 8)
constexpr int eb = sizeof(float);
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 64)
}
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 128)
}
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
}
else if (a.num_pixels % 4 == 0) {
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 1, 256)
}
else {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 8, 256)
}
}
else {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
}
}
}
return rtn;
......
......@@ -13,6 +13,7 @@ struct elementwise_trait
std::string acc_type; // type to do intermediate computation
std::string output_type; // type to store out
std::string op;
int num_cu;
};
struct elementwise_kargs : public ck_tile::ElementwiseUnaryHostArgs
......
#!/bin/sh
# Runs the elementwise cast example over a fixed list of element counts,
# first for fp16 -> fp32 and then for fp32 -> fp16.
EXE=./build/bin/tile_example_elementwise

# Element counts to run, in the order they are executed.
SIZES="2043904 992256 846304 434176 159424 98304 73728 17408 512 256"

# run_sweep <input precision> <output precision>
# Invokes the example once per entry in SIZES with the given precisions.
run_sweep() {
    for n in $SIZES; do
        $EXE -pr_i="$1" -pr_o="$2" -n="$n"
    done
}

run_sweep fp16 fp32
echo "-------------------------------------"
run_sweep fp32 fp16
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment