Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
0aa899aa
Commit
0aa899aa
authored
Apr 06, 2022
by
Jehandad Khan
Browse files
add hipEvent based timing to kernels
parent
44757d6b
Changes
46
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
107 additions
and
43 deletions
+107
-43
include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp
.../gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp
+5
-3
include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp
.../device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp
+5
-3
include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp
...ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp
+4
-3
include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
...operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
+4
-3
include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
...nsor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
+4
-3
include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
...k/tensor_operation/gpu/device/device_reduce_blockwise.hpp
+4
-3
include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
...ration/gpu/device/device_reduce_blockwise_second_call.hpp
+4
-3
include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
...ration/gpu/device/device_reduce_multiblock_atomic_add.hpp
+3
-3
include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
...on/gpu/device/device_reduce_multiblock_partial_reduce.hpp
+4
-3
include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
.../tensor_operation/gpu/device/device_reduce_threadwise.hpp
+4
-3
library/include/ck/library/host/host_interface.hpp
library/include/ck/library/host/host_interface.hpp
+2
-2
library/include/ck/library/host_tensor/device.hpp
library/include/ck/library/host_tensor/device.hpp
+56
-3
library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
...reference_tensor_operation/cpu/reference_batched_gemm.hpp
+1
-1
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
...e_tensor_operation/cpu/reference_conv_backward_weight.hpp
+1
-1
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
...eference_tensor_operation/cpu/reference_conv_bwd_data.hpp
+1
-1
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
...ary/reference_tensor_operation/cpu/reference_conv_fwd.hpp
+1
-1
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
...nsor_operation/cpu/reference_conv_fwd_bias_activation.hpp
+1
-1
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
..._operation/cpu/reference_conv_fwd_bias_activation_add.hpp
+1
-1
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
...library/reference_tensor_operation/cpu/reference_gemm.hpp
+1
-1
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
...reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
+1
-1
No files found.
include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp
View file @
0aa899aa
...
...
@@ -273,7 +273,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
{
using
Argument
=
DeviceOp
::
Argument
;
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
{
{
std
::
cout
<<
"arg.a_grid_desc_k0_m_k1_{"
<<
arg
.
a_grid_desc_k0_m_k1_
.
GetLength
(
I0
)
...
...
@@ -336,6 +336,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
dim3
(
BlockSize
),
0
,
stream_id
,
measure_time
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
...
...
@@ -376,6 +377,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
dim3
(
BlockSize
),
0
,
stream_id
,
measure_time
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
...
...
@@ -394,9 +396,9 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
}
// polymorphic
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
override
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
);
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
,
measure_time
);
}
};
...
...
include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp
View file @
0aa899aa
...
...
@@ -312,7 +312,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
{
using
Argument
=
DeviceOp
::
Argument
;
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
{
{
std
::
cout
<<
"arg.a_grid_desc_k0_m_k1_{"
<<
arg
.
a_grid_desc_k0_m_k1_
.
GetLength
(
I0
)
...
...
@@ -381,6 +381,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
dim3
(
BlockSize
),
0
,
stream_id
,
measure_time
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
...
...
@@ -426,6 +427,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
dim3
(
BlockSize
),
0
,
stream_id
,
measure_time
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
...
...
@@ -446,9 +448,9 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
}
// polymorphic
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
override
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
);
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
,
measure_time
);
}
};
...
...
include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp
View file @
0aa899aa
...
...
@@ -385,7 +385,7 @@ struct DeviceGemmXdlSplitK
std
::
cout
<<
"arg.c_grid_desc_m_n_{ "
<<
arg
.
c_grid_desc_m_n_
.
GetLength
(
I0
)
<<
", "
<<
arg
.
c_grid_desc_m_n_
.
GetLength
(
I1
)
<<
"}"
<<
std
::
endl
;
}
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
{
const
auto
kbatch
=
arg
.
a_grid_desc_kbatch_k0_m_k1_
.
GetLength
(
I0
);
...
...
@@ -417,6 +417,7 @@ struct DeviceGemmXdlSplitK
dim3
(
BlockSize
),
0
,
stream_id
,
measure_time
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
...
...
@@ -533,9 +534,9 @@ struct DeviceGemmXdlSplitK
}
// polymorphic
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
override
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
);
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
,
measure_time
);
}
};
...
...
include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
View file @
0aa899aa
...
...
@@ -391,7 +391,7 @@ struct DeviceGemmXdlSplitKCShuffle
std
::
cout
<<
"arg.c_grid_desc_m_n_{ "
<<
arg
.
c_grid_desc_m_n_
.
GetLength
(
I0
)
<<
", "
<<
arg
.
c_grid_desc_m_n_
.
GetLength
(
I1
)
<<
"}"
<<
std
::
endl
;
}
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
{
const
auto
kbatch
=
arg
.
a_grid_desc_kbatch_k0_m_k1_
.
GetLength
(
I0
);
...
...
@@ -424,6 +424,7 @@ struct DeviceGemmXdlSplitKCShuffle
dim3
(
BlockSize
),
0
,
stream_id
,
measure_time
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
...
...
@@ -544,9 +545,9 @@ struct DeviceGemmXdlSplitKCShuffle
}
// polymorphic
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
override
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
);
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
,
measure_time
);
}
};
...
...
include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
View file @
0aa899aa
...
...
@@ -204,7 +204,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
{
using
gridwise_reduce
=
GridwiseReduction_mk_to_m_threadwise
<
InDataType
,
OutDataType
,
...
...
@@ -247,6 +247,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
dim3
(
BlockSize
),
0
,
stream_id
,
measure_time
,
arg
.
a_grid_desc_m_k_
,
arg
.
b_grid_desc_m_
,
arg
.
in_element_op_
,
...
...
@@ -258,9 +259,9 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
arg
.
p_out_indices_dev_
);
}
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
override
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
);
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
,
measure_time
);
}
};
...
...
include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
View file @
0aa899aa
...
...
@@ -198,7 +198,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
{
const
auto
in_grid_desc_m_k
=
DeviceReduceBlockWise
::
MakeSrc2dDescriptor
(
arg
.
inLengths_
,
arg
.
inStrides_
);
...
...
@@ -246,6 +246,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
dim3
(
BlockSize
),
0
,
stream_id
,
measure_time
,
in_grid_desc_m_k
,
out_grid_desc_m
,
arg
.
in_elementwise_op_
,
...
...
@@ -260,9 +261,9 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
return
(
avg_time
);
};
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
override
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
);
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
,
measure_time
);
};
};
...
...
include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
View file @
0aa899aa
...
...
@@ -175,7 +175,7 @@ struct DeviceReduceBlockWiseSecondCall
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
{
const
auto
in_grid_desc_m_k
=
DeviceReduceBlockWiseSecondCall
::
MakeSrc2dDescriptor
(
arg
.
inLengths_
,
arg
.
inStrides_
);
...
...
@@ -223,6 +223,7 @@ struct DeviceReduceBlockWiseSecondCall
dim3
(
BlockSize
),
0
,
stream_id
,
measure_time
,
in_grid_desc_m_k
,
out_grid_desc_m
,
arg
.
in_elementwise_op_
,
...
...
@@ -237,9 +238,9 @@ struct DeviceReduceBlockWiseSecondCall
return
(
avg_time
);
};
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
override
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
);
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
,
measure_time
);
};
};
...
...
include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
View file @
0aa899aa
...
...
@@ -234,7 +234,7 @@ struct DeviceReduceMultiBlockAtomicAdd
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
=
false
)
{
const
auto
in_grid_desc_m_k
=
DeviceReduceMultiBlockAtomicAdd
::
MakeSrc2dDescriptor
(
arg
.
inLengths_
,
arg
.
inStrides_
,
arg
.
blkGroupSize
,
arg
.
kBlockTileIterations
);
...
...
@@ -318,9 +318,9 @@ struct DeviceReduceMultiBlockAtomicAdd
return
(
avg_time
);
};
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
override
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
);
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
,
measure_time
);
};
};
...
...
include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
View file @
0aa899aa
...
...
@@ -259,7 +259,7 @@ struct DeviceReduceMultiBlockPartialReduce
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
{
const
auto
in_grid_desc_m_k
=
DeviceReduceMultiBlockPartialReduce
::
MakeSrc2dDescriptor
(
arg
.
inLengths_
,
arg
.
inStrides_
,
arg
.
blkGroupSize
,
arg
.
kBlockTileIterations
);
...
...
@@ -305,6 +305,7 @@ struct DeviceReduceMultiBlockPartialReduce
dim3
(
BlockSize
),
0
,
stream_id
,
measure_time
,
in_grid_desc_m_k
,
ws_desc_m_k
,
arg
.
in_elementwise_op_
,
...
...
@@ -318,9 +319,9 @@ struct DeviceReduceMultiBlockPartialReduce
return
(
avg_time
);
};
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
override
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
);
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
,
measure_time
);
};
};
...
...
include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
View file @
0aa899aa
...
...
@@ -198,7 +198,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
{
const
auto
in_grid_desc_m_k
=
DeviceReduceThreadWise
::
MakeSrc2dDescriptor
(
arg
.
inLengths_
,
arg
.
inStrides_
);
...
...
@@ -246,6 +246,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
dim3
(
BlockSize
),
0
,
stream_id
,
measure_time
,
in_grid_desc_m_k
,
out_grid_desc_m
,
arg
.
in_elementwise_op_
,
...
...
@@ -259,9 +260,9 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
return
(
avg_time
);
};
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
)
override
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
,
hipStream_t
stream_id
=
nullptr
,
bool
measure_time
=
false
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
);
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
,
stream_id
,
measure_time
);
};
};
...
...
library/include/ck/library/host/host_interface.hpp
View file @
0aa899aa
...
...
@@ -28,8 +28,8 @@ struct DeviceConvFwdPtr_t
std
::
vector
<
ck
::
index_t
>
conv_filter_strides
,
std
::
vector
<
ck
::
index_t
>
conv_filter_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
);
// in,wei and out element ops are ignored for now since even if we change them, they cant be linked
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
();
// requires including BaseInvoker headers
std
::
vector
<
ck
::
index_t
>
input_right_pads
)
const
;
// in,wei and out element ops are ignored for now since even if we change them, they cant be linked
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
const
;
// requires including BaseInvoker headers
std
::
string
GetTypeString
();
bool
IsSupportedArgument
(
const
BaseArgument
*
arg_ptr
);
};
...
...
library/include/ck/library/host_tensor/device.hpp
View file @
0aa899aa
#ifndef DEVICE_HPP
#define DEVICE_HPP
#include "ck/options.hpp"
#include <memory>
#include <functional>
#include <thread>
...
...
@@ -8,6 +10,39 @@
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
inline
void
hip_check
(
hipError_t
x
)
{
if
(
x
!=
hipSuccess
)
throw
std
::
runtime_error
(
"Failed to run HIP call"
);
}
template
<
typename
F
,
F
f
>
struct
managed_deleter
{
template
<
typename
T
>
void
operator
()(
T
*
t
)
{
if
(
t
!=
nullptr
)
{
std
::
ignore
=
f
(
t
);
}
}
};
template
<
typename
T
,
typename
F
,
F
f
>
using
managed_pointer
=
std
::
unique_ptr
<
T
,
managed_deleter
<
F
,
f
>>
;
using
hipEventPtr
=
managed_pointer
<
typename
std
::
remove_pointer
<
hipEvent_t
>::
type
,
decltype
(
&
hipEventDestroy
),
hipEventDestroy
>
;
inline
hipEventPtr
make_hip_event
()
{
hipEvent_t
result
=
nullptr
;
hip_check
(
hipEventCreate
(
&
result
));
return
hipEventPtr
{
result
};
}
struct
DeviceMem
{
DeviceMem
()
=
delete
;
...
...
@@ -44,9 +79,9 @@ void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte
template
<
typename
...
Args
,
typename
F
>
float
launch_and_time_kernel
(
F
kernel
,
int
nrepeat
,
dim3
grid_dim
,
dim3
block_dim
,
std
::
size_t
lds_byte
,
hipStream_t
stream_id
,
Args
...
args
)
F
kernel
,
int
nrepeat
,
dim3
grid_dim
,
dim3
block_dim
,
std
::
size_t
lds_byte
,
hipStream_t
stream_id
,
bool
measure_time
,
Args
...
args
)
{
#if
1
#if
CK_TIME_KERNELS
KernelTimer
timer
;
printf
(
"%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d}
\n
"
,
...
...
@@ -78,9 +113,27 @@ float launch_and_time_kernel(
return
timer
.
GetElapsedTime
()
/
nrepeat
;
#else
std
::
ignore
=
nrepeat
;
hipEventPtr
start
=
nullptr
;
hipEventPtr
stop
=
nullptr
;
float
elapsed_time
=
0.0
f
;
if
(
measure_time
)
{
start
=
make_hip_event
();
stop
=
make_hip_event
();
hip_check
(
hipEventRecord
(
start
.
get
(),
stream_id
));
}
launch_kernel
(
kernel
,
grid_dim
,
block_dim
,
lds_byte
,
stream_id
,
args
...);
return
0
;
if
(
measure_time
)
{
hip_check
(
hipEventRecord
(
stop
.
get
(),
stream_id
));
hip_check
(
hipEventSynchronize
(
stop
.
get
()));
hip_check
(
hipEventElapsedTime
(
&
elapsed_time
,
start
.
get
(),
stop
.
get
()));
}
return
elapsed_time
;
#endif
}
#endif
library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
View file @
0aa899aa
...
...
@@ -84,7 +84,7 @@ struct ReferenceBatchedGemm : public device::BaseOperator
return
0
;
}
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
)
override
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
,
bool
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
View file @
0aa899aa
...
...
@@ -114,7 +114,7 @@ struct ReferenceConvWrw : public device::BaseOperator
return
0
;
}
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
)
override
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
,
bool
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
View file @
0aa899aa
...
...
@@ -129,7 +129,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
return
0
;
}
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
)
override
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
,
bool
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
View file @
0aa899aa
...
...
@@ -171,7 +171,7 @@ struct ReferenceConvFwd : public device::BaseOperator
}
}
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
)
override
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
,
bool
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
View file @
0aa899aa
...
...
@@ -117,7 +117,7 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
return
0
;
}
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
)
override
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
,
bool
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
View file @
0aa899aa
...
...
@@ -123,7 +123,7 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
return
0
;
}
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
)
override
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
,
bool
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
View file @
0aa899aa
...
...
@@ -82,7 +82,7 @@ struct ReferenceGemm : public device::BaseOperator
return
0
;
}
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
)
override
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
,
bool
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
View file @
0aa899aa
...
...
@@ -82,7 +82,7 @@ struct ReferenceGemmBias2D : public device::BaseOperator
return
0
;
}
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
)
override
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
int
,
hipStream_t
,
bool
)
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment