gaoqiong / composable_kernel · Commits · c6b52884

Commit c6b52884 authored May 30, 2022 by wangshaojie6

    add unary type convert to bwd-weight example

Parent: c4b6b9b1
Showing 6 changed files with 91 additions and 51 deletions (+91 −51)
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl.cpp              +1  −1
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp  +48 −16
include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp     +9  −11
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp      +12 −3
include/ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp  +12 −20
library/include/ck/library/host_tensor/device.hpp                       +9  −0
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl.cpp

@@ -297,7 +297,7 @@ int main(int argc, char* argv[])
                                                       split_k);

     // alloc work space
     float ave_time = 0.f;

     if(!conv->IsSupportedArgument(argument.get()))
     {
         std::cout << "wrong! device_conv with the specified compilation parameters does "
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp

@@ -15,6 +15,7 @@
 #include "device_tensor.hpp"
 #include "tensor_layout.hpp"
 #include "element_wise_operation.hpp"
+#include "device_unary_elementwise.hpp"
 #include "device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
 #include "reference_conv_backward_weight.hpp"
@@ -30,6 +31,11 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

+using UnaryTypeConvert = ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float>;
+using DeviceUnaryElementwiseTypeConvertInstance =
+    ck::tensor_operation::device::
+        DeviceUnaryElementwise<AccDataType, WeiDataType, UnaryTypeConvert, 1, 4>;
+
 static constexpr auto ConvBwdWeightDefault =
     ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
@@ -95,7 +101,7 @@ void host_elementwise(HostTensorB& B,
                       Functor functor)
 {
     size_t tensor_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
     std::cout << __LINE__ << ":" << tensor_size << ", " << A.mData[0] << std::endl;
     for(std::size_t n = 0; n < tensor_size; ++n)
     {
         B.mData[n] = functor(A.mData[n]);
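For reference, host_elementwise above flattens the filter shape with std::accumulate and applies the functor element by element on the host. Below is a minimal standalone sketch of the same pattern; it uses hypothetical std::vector tensors instead of the example's Tensor<> type and an identity functor, just to show the flow.

// Self-contained sketch (assumed types, not the example's exact code) of the
// host_elementwise pattern above: flatten the shape, then apply a functor per element.
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

template <typename TensorB, typename TensorA, typename Functor>
void host_elementwise_sketch(TensorB& B, const TensorA& A, const std::vector<int>& shape, Functor functor)
{
    size_t tensor_size =
        std::accumulate(shape.begin(), shape.end(), size_t{1}, std::multiplies<size_t>{});
    for(size_t n = 0; n < tensor_size; ++n)
        B[n] = functor(A[n]);
}

int main()
{
    std::vector<int> shape{2, 3, 3, 3};        // hypothetical K, C, Y, X
    std::vector<float> a(2 * 3 * 3 * 3, 1.5f); // stands in for the fp32 workspace
    std::vector<float> b(a.size());            // destination buffer
    host_elementwise_sketch(b, a, shape, [](float x) { return x; }); // identity functor for demo
    std::cout << b.front() << std::endl;
    return 0;
}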
@@ -318,7 +324,8 @@ int main(int argc, char* argv[])
     // alloc work space
     size_t bwd_weight_workspace_size = conv->GetWorkSpaceSize(argument.get());
-    float ave_time              = 0.f;
+    float conv_ave_time         = 0.f;
+    float type_convert_ave_time = 0.f;

     DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size);
     wei_work_space_device_buf.SetZero();
@@ -349,17 +356,42 @@ int main(int argc, char* argv[])
         return 1;
     }

-    ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
+    conv_ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
+
+    // do type convert
+    auto type_convert         = DeviceUnaryElementwiseTypeConvertInstance{};
+    auto type_convert_invoker = type_convert.MakeInvokerPointer();
+    int tensor_size =
+        std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});
+    auto type_convert_argument =
+        type_convert.MakeArgumentPointer(wei_work_space_device_buf.GetDeviceBuffer(),
+                                         wei_device_buf.GetDeviceBuffer(),
+                                         {tensor_size},
+                                         {1},
+                                         {1},
+                                         UnaryTypeConvert{});
+
+    if(!type_convert.IsSupportedArgument(type_convert_argument.get()))
+    {
+        std::cout << "wrong! device_type_convert with the specified compilation parameters does "
+                     "not support this convert problem"
+                  << std::endl;
+        return 1;
+    }
+
+    type_convert_ave_time =
+        type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});
+    // type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});

     // host code to check if conv give me a right result
-    Tensor<AccDataType> wei_k_c_y_x_device_result_fp32(
-        ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
-    wei_work_space_device_buf.FromDevice(wei_k_c_y_x_device_result_fp32.mData.data());
-    const auto type_cvt_functor = [&](AccDataType a) {
-        return ck::type_convert<WeiDataType, AccDataType>(a);
-    };
-    host_elementwise<Tensor<WeiDataType>, Tensor<AccDataType>, decltype(type_cvt_functor)>(
-        wei_k_c_y_x_device_result, wei_k_c_y_x_device_result_fp32, filter_dims, type_cvt_functor);
+    // Tensor<AccDataType> wei_k_c_y_x_device_result_fp32(
+    //     ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
+    // wei_work_space_device_buf.FromDevice(wei_k_c_y_x_device_result_fp32.mData.data());
+    // const auto type_cvt_functor = [&](AccDataType a) {
+    //     return ck::type_convert<WeiDataType, AccDataType>(a);
+    // };
+    // host_elementwise<Tensor<WeiDataType>, Tensor<AccDataType>, decltype(type_cvt_functor)>(
+    //     wei_k_c_y_x_device_result, wei_k_c_y_x_device_result_fp32, filter_dims,
+    //     type_cvt_functor);

     std::size_t flop = ck::utils::conv::get_flops(
         params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
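The added block follows the usual CK device-op flow: construct the DeviceUnaryElementwiseTypeConvertInstance, build an argument whose source is the fp32 workspace and whose destination is the bf16 weight buffer, reject unsupported cases with IsSupportedArgument, then run through the invoker. A toy, self-contained sketch of that control flow with hypothetical mock classes (not CK's API; the divisibility test only stands in for a real supportability check):

// Minimal control-flow sketch (hypothetical classes, not CK code) of the
// op -> argument -> supported-check -> invoker -> run pattern added above.
#include <iostream>
#include <memory>

struct MockArgument { int size; };

struct MockInvoker
{
    float Run(const MockArgument& arg) const
    {
        // a real invoker would launch the conversion kernel and return elapsed ms
        return 0.01f * arg.size;
    }
};

struct MockTypeConvertOp
{
    std::unique_ptr<MockArgument> MakeArgumentPointer(int size) const
    {
        return std::make_unique<MockArgument>(MockArgument{size});
    }
    bool IsSupportedArgument(const MockArgument* arg) const { return arg->size % 4 == 0; }
    std::unique_ptr<MockInvoker> MakeInvokerPointer() const { return std::make_unique<MockInvoker>(); }
};

int main()
{
    MockTypeConvertOp op{};
    auto argument = op.MakeArgumentPointer(108); // hypothetical K*C*Y*X element count
    if(!op.IsSupportedArgument(argument.get()))
    {
        std::cout << "unsupported convert problem" << std::endl;
        return 1;
    }
    auto invoker = op.MakeInvokerPointer();
    std::cout << "type_convert time: " << invoker->Run(*argument) << " ms" << std::endl;
    return 0;
}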
@@ -371,12 +403,12 @@ int main(int argc, char* argv[])
         params.filter_spatial_lengths_, output_spatial_lengths);

-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    float tflops     = static_cast<float>(flop) / 1.E9 / conv_ave_time;
+    float gb_per_sec = num_btype / 1.E6 / conv_ave_time;

-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
+    std::cout << "Perf: conv: " << conv_ave_time << " ms, type_convert: " << type_convert_ave_time
+              << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;

     if(do_verification)
     {
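The reported numbers keep the formulas tflops = flop / 1e9 / time and gb_per_sec = bytes / 1e6 / time, which are correct when the time is given in milliseconds, and they are now based on conv_ave_time only; the type-convert time is printed separately. A quick numeric check with hypothetical values:

// Quick check (hypothetical numbers) of the performance formulas above,
// assuming the elapsed time is in milliseconds.
#include <cstdio>

int main()
{
    const double flop      = 5.0e10; // hypothetical FLOP count of the conv problem
    const double num_btype = 5.0e8;  // hypothetical bytes moved
    const double time_ms   = 1.25;   // hypothetical conv_ave_time

    std::printf("%.2f TFlops, %.2f GB/s\n", flop / 1.e9 / time_ms, num_btype / 1.e6 / time_ms);
    return 0;
}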
@@ -396,7 +428,7 @@ int main(int argc, char* argv[])
         ref_invoker.Run(ref_argument);

-        // wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
+        wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());

         if(do_log)
         {
include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp

@@ -12,7 +12,6 @@ namespace device {
 template <typename ADataType,
           typename BDataType,
-          typename ComputeDataType,
           typename ElementwiseFunctor,
           index_t Dim,
           index_t ScalarPerVector>
@@ -62,11 +61,10 @@ struct DeviceUnaryElementwise : public BaseOperator
     using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));

     using GridwiseBinEltwise = GridwiseUnaryElementwise_1D<ADataType,
                                                            BDataType,
-                                                           ComputeDataType,
                                                            GridDesc_M0,
                                                            ElementwiseFunctor,
                                                            ScalarPerVector>;

     struct Argument : public BaseArgument
     {
@@ -81,7 +79,7 @@ struct DeviceUnaryElementwise : public BaseOperator
               shape_(shape),
               functor_(functor),
               blockSize_(256),
-              gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
+              gridSize_(240) // FIXME - Calculate the grid size by number of CU in the future
         {
             a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
             b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
@@ -102,10 +100,10 @@ struct DeviceUnaryElementwise : public BaseOperator
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto kernel = kernel_unary_elementwise_1d<GridwiseBinEltwise,
                                                             ADataType,
                                                             BDataType,
                                                             GridDesc_M0,
                                                             ElementwiseFunctor>;

             float elapsed_time = launch_and_time_kernel(stream_config,
                                                         kernel,
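The constructor above doubles the launch from gridSize_(120) to gridSize_(240) with blockSize_(256), and the FIXME notes the grid size should eventually be derived from the CU count. Combined with the grid-stride loop in gridwise_unary_elementwise_1d.hpp, each iteration covers gridSize * blockSize * ScalarPerVector elements. A back-of-the-envelope sketch, with an assumed tensor size:

// Back-of-the-envelope sketch (assumed tensor size, not CK code): how many
// grid-stride iterations a 240 x 256 launch with ScalarPerVector = 4 needs.
#include <cstdio>

int main()
{
    const long grid_size         = 240; // value set in this commit
    const long block_size        = 256;
    const long scalar_per_vector = 4;   // value used by the bf16 split-K example
    const long tensor_size       = 256L * 256 * 3 * 3; // hypothetical K*C*Y*X

    const long loop_step  = grid_size * block_size * scalar_per_vector;
    const long iterations = (tensor_size + loop_step - 1) / loop_step;

    std::printf("loop_step = %ld elements, iterations = %ld\n", loop_step, iterations);
    return 0;
}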
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp

@@ -335,11 +335,20 @@ struct UnaryTypeConvert;
 template <>
 struct UnaryTypeConvert<float, ck::bhalf_t>
 {
+    __host__ __device__ UnaryTypeConvert(const int32_t divider = 1) { (void)divider; };
+
     __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
     {
         y = ck::type_convert<float, ck::bhalf_t>(x);
     };
 };

+template <>
+struct UnaryTypeConvert<ck::bhalf_t, float>
+{
+    __host__ __device__ void operator()(ck::bhalf_t& y, float& x) const
+    {
+        y = ck::type_convert<ck::bhalf_t, float>(x);
+    };
+};
+
 } // namespace element_wise
 } // namespace tensor_operation
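The new UnaryTypeConvert<ck::bhalf_t, float> specialization writes the bf16 result through its first reference argument, mirroring the existing float-from-bf16 specialization. Below is a hypothetical standalone analog of that functor shape; plain bit truncation stands in for ck::type_convert, which may round differently.

// Hypothetical standalone analog (not CK code) of the UnaryTypeConvert functor pattern:
// the functor writes the converted value through the first reference argument.
#include <cstdint>
#include <cstring>
#include <iostream>

struct FloatToBf16Bits
{
    // CK's specialization calls ck::type_convert<ck::bhalf_t, float>; a naive
    // truncating conversion stands in for it here.
    void operator()(uint16_t& y, float& x) const
    {
        uint32_t bits;
        std::memcpy(&bits, &x, sizeof(bits));
        y = static_cast<uint16_t>(bits >> 16);
    }
};

int main()
{
    FloatToBf16Bits convert{};
    float x    = 1.0f;
    uint16_t y = 0;
    convert(y, x);
    std::cout << std::hex << "0x" << y << std::endl; // 1.0f -> 0x3f80 in bf16
    return 0;
}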
include/ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp

@@ -13,21 +13,16 @@ template <typename GridwiseUEltwise,
           typename GridDesc_M0,
           typename ElementwiseFunctor>
 __global__ void kernel_unary_elementwise_1d(const ADataType* __restrict__ p_a_global,
                                             BDataType* __restrict__ p_b_global,
                                             const GridDesc_M0 a_grid_desc_m0,
                                             const GridDesc_M0 b_grid_desc_m0,
                                             const ElementwiseFunctor functor)
 {
     GridwiseUEltwise::Run(p_a_global,
                           p_b_global, a_grid_desc_m0, b_grid_desc_m0, functor);
 }

 template <typename ADataType,
           typename BDataType,
-          typename ComputeDataType,
           typename GridDesc_M0,
           typename ElementwiseFunctor,
           index_t ScalarPerVector>
@@ -46,11 +41,9 @@ struct GridwiseUnaryElementwise_1D
     }

     __device__ static void Run(const ADataType* __restrict__ p_a_global,
-                               const BDataType* __restrict__ p_b_global,
-                               CDataType* __restrict__ p_c_global,
+                               BDataType* __restrict__ p_b_global,
                                const GridDesc_M0 a_grid_desc_m0,
                                const GridDesc_M0 b_grid_desc_m0,
-                               const GridDesc_M0 c_grid_desc_m0,
                                const ElementwiseFunctor functor)
     {
         const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -58,14 +51,14 @@ struct GridwiseUnaryElementwise_1D
         auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_b_global, b_grid_desc_m0.GetElementSpaceSize());

-        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> a_thread_buf;
-        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> b_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ADataType, ScalarPerVector, true> a_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, BDataType, ScalarPerVector, true> b_thread_buf;

         const auto thread_store_global_offset = CalculateElementwiseIndex();

         auto a_global_load =
             ThreadwiseTensorSliceTransfer_v2<ADataType,
-                                             ComputeDataType,
+                                             ADataType,
                                              GridDesc_M0,
                                              decltype(thread_desc_m0),
                                              Sequence<ScalarPerVector>, // SliceLengths
@@ -76,7 +69,7 @@ struct GridwiseUnaryElementwise_1D
                                              false>{a_grid_desc_m0, thread_store_global_offset};

         auto b_global_write =
-            ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
+            ThreadwiseTensorSliceTransfer_v1r3<BDataType,
                                                BDataType,
                                                decltype(thread_desc_m0),
                                                GridDesc_M0,
@@ -92,7 +85,7 @@ struct GridwiseUnaryElementwise_1D
         const index_t blockSize    = get_block_size();
         const index_t blockPerGrid = get_grid_size();
-        const auto m0              = c_grid_desc_m0.GetLength(I0);
+        const auto m0              = b_grid_desc_m0.GetLength(I0);
         const index_t loop_step    = blockPerGrid * blockSize * ScalarPerVector;
         const auto loop_step_index = make_multi_index(loop_step);
@@ -105,8 +98,7 @@ struct GridwiseUnaryElementwise_1D
             static_for<0, ScalarPerVector, 1>{}([&](auto m) {
                 constexpr auto offset = thread_desc_m0.CalculateOffset(make_tuple(m));
                 functor(b_thread_buf(Number<offset>{}), a_thread_buf(Number<offset>{}));
             });

             b_global_write.Run(thread_desc_m0,
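The gridwise kernel walks the flattened tensor in a grid-stride loop (loop_step = blockPerGrid * blockSize * ScalarPerVector) and applies the functor per element before writing back. A simplified HIP sketch of that pattern, without CK's descriptors and vectorized ThreadwiseTensorSliceTransfer; naive bf16 truncation is assumed and error checking is omitted.

// Simplified HIP sketch (not CK code) of a grid-stride unary elementwise kernel
// converting float to bf16 bits, one element per thread per iteration.
#include <hip/hip_runtime.h>
#include <cstdint>
#include <cstdio>
#include <vector>

__global__ void unary_convert_kernel(const float* __restrict__ a, uint16_t* __restrict__ b, int n)
{
    const int step = gridDim.x * blockDim.x;
    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += step)
    {
        // naive float -> bf16 truncation; ck::type_convert may round instead
        union { float f; uint32_t u; } v{a[i]};
        b[i] = static_cast<uint16_t>(v.u >> 16);
    }
}

int main()
{
    const int n = 1 << 20;
    std::vector<float> host_a(n, 1.0f);
    std::vector<uint16_t> host_b(n, 0);

    float* dev_a;
    uint16_t* dev_b;
    hipMalloc(reinterpret_cast<void**>(&dev_a), n * sizeof(float));
    hipMalloc(reinterpret_cast<void**>(&dev_b), n * sizeof(uint16_t));
    hipMemcpy(dev_a, host_a.data(), n * sizeof(float), hipMemcpyHostToDevice);

    unary_convert_kernel<<<240, 256>>>(dev_a, dev_b, n); // grid/block sizes from this commit
    hipMemcpy(host_b.data(), dev_b, n * sizeof(uint16_t), hipMemcpyDeviceToHost);

    std::printf("b[0] = 0x%04x\n", host_b[0]); // expect 0x3f80 for 1.0f
    hipFree(dev_a);
    hipFree(dev_b);
    return 0;
}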
library/include/ck/library/host_tensor/device.hpp

@@ -111,6 +111,15 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
         }
         else
         {
+            printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
+                   __func__,
+                   grid_dim.x,
+                   grid_dim.y,
+                   grid_dim.z,
+                   block_dim.x,
+                   block_dim.y,
+                   block_dim.z);
+
             kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);

             return 0;
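The new branch prints the launch dimensions, launches the kernel once, and returns 0 ms when timing is disabled. A hedged sketch of a launch-and-time helper with the same shape (hypothetical, not CK's implementation; the timed path wraps a single launch in HIP events, whereas the real helper may average over repeated launches):

// Hypothetical sketch of a launch_and_time_kernel-style helper: print dims and
// launch once when timing is off, otherwise time a launch with HIP events.
#include <hip/hip_runtime.h>
#include <cstdio>

template <typename Kernel, typename... Args>
float launch_and_maybe_time(bool time_kernel, Kernel kernel, dim3 grid_dim, dim3 block_dim, Args... args)
{
    if(!time_kernel)
    {
        std::printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u}\n",
                    __func__, grid_dim.x, grid_dim.y, grid_dim.z, block_dim.x, block_dim.y, block_dim.z);
        kernel<<<grid_dim, block_dim, 0, nullptr>>>(args...);
        return 0;
    }

    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);
    hipEventRecord(start, nullptr);
    kernel<<<grid_dim, block_dim, 0, nullptr>>>(args...);
    hipEventRecord(stop, nullptr);
    hipEventSynchronize(stop);
    float ms = 0.f;
    hipEventElapsedTime(&ms, start, stop);
    hipEventDestroy(start);
    hipEventDestroy(stop);
    return ms;
}

__global__ void fill_kernel(float* p, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n) p[i] = 1.f;
}

int main()
{
    const int n = 1024;
    float* p;
    hipMalloc(reinterpret_cast<void**>(&p), n * sizeof(float));
    float ms = launch_and_maybe_time(true, fill_kernel, dim3(4), dim3(256), p, n);
    std::printf("elapsed: %f ms\n", ms);
    hipFree(p);
    return 0;
}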