yangql/composable_kernel-1, commit 2c9b8c24
Authored Mar 12, 2019 by Chao Liu

    update hip build

Parent: 0c88a3d8
Showing 12 changed files with 97 additions and 82 deletions (+97, -82).
driver/driver.hip.cpp (+7, -7)
src/include/blockwise_4d_tensor_op.hip.hpp (+5, -4)
src/include/blockwise_direct_convolution.hip.hpp (+15, -12)
src/include/blockwise_gemm.hip.hpp (+14, -11)
src/include/common.hip.hpp (+3, -1)
src/include/config.h.in (+0, -1)
src/include/gridwise_direct_convolution_1.hip.hpp (+10, -9)
src/include/gridwise_direct_convolution_2.hip.hpp (+20, -16)
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp (+15, -13)
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp (+5, -4)
src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp (+2, -2)
src/include/tensor.hpp (+1, -2)
File: driver/driver.hip.cpp

@@ -47,7 +47,7 @@ struct GeneratorTensor_3

```cpp
        std::initializer_list<std::size_t> ids = {static_cast<std::size_t>(is)...};

        std::vector<std::size_t> lens(sizeof...(Is), 100);
        std::vector<std::size_t> strides(sizeof...(Is), 1);

        std::partial_sum(
            lens.rbegin(), lens.rbegin() + (sizeof...(Is) - 1), strides.rbegin() + 1);

        return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1;
#endif
    }
```
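For context, the arithmetic in this hunk turns a parameter pack of coordinates into a single scalar test value: a reverse `partial_sum` over the all-100 `lens` fills `strides` from the right, and `inner_product` folds the coordinates against those strides. Below is a minimal standalone sketch of the same computation; the free-function name `generator_value` and the `main()` driver are illustrative assumptions, not code from this commit.

```cpp
#include <cstddef>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <vector>

// Sketch of the hunk's index-folding arithmetic (hypothetical wrapper name).
// Note: partial_sum uses its default operator+, so for 3 coordinates the
// strides become {200, 100, 1} rather than true row-major products.
template <typename... Is>
std::size_t generator_value(Is... is)
{
    std::initializer_list<std::size_t> ids = {static_cast<std::size_t>(is)...};

    std::vector<std::size_t> lens(sizeof...(Is), 100);
    std::vector<std::size_t> strides(sizeof...(Is), 1);

    // suffix sums of lens fill all but the last stride, right to left
    std::partial_sum(
        lens.rbegin(), lens.rbegin() + (sizeof...(Is) - 1), strides.rbegin() + 1);

    // fold coordinates against the pseudo-strides, then offset by 1
    return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1;
}

int main()
{
    // 1*200 + 2*100 + 3*1 + 1 = 404
    std::cout << generator_value(1, 2, 3) << '\n';
}
```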
File: src/include/blockwise_4d_tensor_op.hip.hpp

@@ -245,7 +245,8 @@ struct BlockwiseChwnTensorCopyPadded

```cpp
        constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;

        const Float* p_src_tmp =
            p_src + src_desc.Get1dIndex(c_block_data_begin,
                                        (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
                                        (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
                                        n_block_data_begin);
```
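The H and W coordinates above map a block's padded output window back into the unpadded input: advance to the block's begin, add the block-local low pad, then subtract the global low pad. A one-dimensional worked example of that expression (all numbers are made up for illustration):

```cpp
#include <cassert>

int main()
{
    // Hypothetical values for one spatial dimension:
    int ho_block_data_begin = 16; // block's first output row
    int h_block_pad_low     = 2;  // padding rows at the low edge of this block
    int h_global_pad_low    = 2;  // global low padding applied to the input

    // Same expression as the hunk: padded coordinate -> unpadded input row.
    int hi_src = (ho_block_data_begin + h_block_pad_low) - h_global_pad_low;
    assert(hi_src == 16);
}
```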
File: src/include/blockwise_direct_convolution.hip.hpp

@@ -93,7 +93,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc,

```cpp
    Float p_out_thread[out_thread_desc.GetElementSpace()];

    threadwise_4d_tensor_copy(out_block_desc,
                              p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
                                                                      k_thread_data_begin,
                                                                      ho_thread_data_begin,
                                                                      wo_thread_data_begin),
```

@@ -107,7 +108,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc,

```cpp
        // threadwise convolution
        threadwise_direct_convolution_2(in_thread_block_desc,
                                        p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
                                                                              c_thread_data_begin,
                                                                              hi_thread_data_begin,
                                                                              wi_thread_data_begin),
```

@@ -122,7 +124,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc,

```cpp
    threadwise_4d_tensor_copy(out_thread_desc,
                              p_out_thread,
                              out_block_desc,
                              p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
                                                                      k_thread_data_begin,
                                                                      ho_thread_data_begin,
                                                                      wo_thread_data_begin),
```
File: src/include/blockwise_gemm.hip.hpp

@@ -431,12 +431,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2

```cpp
        constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
        constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;

        // loop over k
#pragma unroll
        for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
        {
            // read first batch of A, B
            // copy A-sub to form A
#pragma unroll
            for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
            {
```

@@ -449,7 +449,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2

```cpp
                                       a_thread_sub_mtx.GetLengths());
            }

            // copy B-sub to form B
#pragma unroll
            for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
            {
```

@@ -462,7 +462,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2

```cpp
                                       b_thread_sub_mtx.GetLengths());
            }

            // loop over batch
#pragma unroll
            for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
            {
```

@@ -557,7 +557,8 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2

```cpp
                {
                    threadwise_matrix_copy(
                        c_thread_sub_mtx,
                        p_c_thread + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
                                                                 n_repeat * NPerLevel1Cluster),
                        c_block_mtx,
                        p_c_block +
```

@@ -656,7 +657,8 @@ struct BlockwiseGemmBlockABlockBThreadC

```cpp
        constexpr unsigned NClusterWork =
            (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster);

        static_assert(BlockSize == (MClusterWork * MThreadPerCluster) *
                                       (NClusterWork * NThreadPerCluster),
                      "wrong! wrong BlockSize");
```

@@ -1256,7 +1258,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2

```cpp
                    p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
                    c_thread_sub_mtx,
                    False,
                    p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
                                                         n_repeat * NPerThreadSubC),
                    f_accum);
            }
```
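The `NClusterWork` expression in the `@@ -656` hunk is a ceiling division: how many cluster-sized chunks of columns cover `NPerBlock`. The `static_assert` then pins `BlockSize` to the thread count that decomposition implies. A compile-time sketch with made-up tile sizes (all constants below are illustrative, not values from this commit):

```cpp
#include <cstdio>

// Hypothetical tile parameters, chosen only to exercise the formulas.
constexpr unsigned NPerBlock         = 128;
constexpr unsigned NPerThread        = 8;
constexpr unsigned NThreadPerCluster = 4;

// Same rounding-up idiom as the diff: (a + b - 1) / b.
constexpr unsigned NClusterWork =
    (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster);

constexpr unsigned MClusterWork      = 2;
constexpr unsigned MThreadPerCluster = 8;

static_assert(NClusterWork == 4, "128 columns / (8*4) per cluster rounds up to 4");
static_assert((MClusterWork * MThreadPerCluster) * (NClusterWork * NThreadPerCluster) == 256,
              "a 256-thread block would satisfy the diff's BlockSize check");

int main() { std::printf("NClusterWork = %u\n", NClusterWork); }
```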
File: src/include/common.hip.hpp

@@ -65,7 +65,7 @@ struct vector_type<half_float::half, 8>

```diff
 };
 #endif
-#if 1
+#if 0
 template <>
 struct vector_type<half, 1>
 {
```

@@ -139,6 +139,7 @@ struct Sequence

```diff
     }
 };
+#if DEVICE_BACKEND_CUDA
 template <typename T>
 __host__ __device__ constexpr T max(T a, T b)
 {
```

@@ -150,6 +151,7 @@ __host__ __device__ constexpr T min(T a, T b)

```diff
 {
     return a < b ? a : b;
 }
+#endif
 __host__ __device__ constexpr unsigned integer_divide_ceil(unsigned a, unsigned b)
 {
```
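The two added lines fence the hand-written `max`/`min` behind `DEVICE_BACKEND_CUDA`, which is consistent with the HIP headers already supplying equivalents that would otherwise collide; the `#if 1` to `#if 0` flip likewise disables the `vector_type<half, N>` specializations for this build. A host-only sketch of the guarded helpers follows: the `__host__ __device__` qualifiers are dropped so it compiles anywhere, the macro value is set by hand, and the body of `integer_divide_ceil` is my assumption of the usual rounding-up idiom (only its signature appears in the diff).

```cpp
#include <cassert>

#define DEVICE_BACKEND_CUDA 1 // stands in for the build system's backend switch

#if DEVICE_BACKEND_CUDA
// Guarded in the diff: presumably the HIP backend gets these from its own headers.
template <typename T>
constexpr T max(T a, T b) { return a > b ? a : b; }

template <typename T>
constexpr T min(T a, T b) { return a < b ? a : b; }
#endif

// Unguarded in the diff; this body is assumed, not shown in the commit.
constexpr unsigned integer_divide_ceil(unsigned a, unsigned b)
{
    return (a + b - 1) / b;
}

int main()
{
    static_assert(integer_divide_ceil(7, 3) == 3, "7/3 rounded up is 3");
    assert(max(2, 5) == 5 && min(2, 5) == 2);
}
```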
File: src/include/config.h.in

@@ -4,7 +4,6 @@

```diff
 #if DEVICE_BACKEND_HIP
 #include "hip/hip_runtime.h"
-#include "half.hpp"
 #elif DEVICE_BACKEND_CUDA
 #include "cuda_runtime.h"
 #include "nvToolsExt.h"
```
File: src/include/gridwise_direct_convolution_1.hip.hpp

@@ -113,7 +113,8 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_

```cpp
         c_block_work_begin += CPerBlock)
    {
        // copy input tensor to LDS
        blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin,
                                                                      c_block_work_begin,
                                                                      hi_block_work_begin,
                                                                      wi_block_work_begin),
```

@@ -143,9 +144,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_

```cpp
    }

    // copy output tensor from LDS to device mem
    blockwise_out_copy.Run(p_out_block,
                           p_out_global +
                               out_global_desc.Get1dIndex(n_block_work_begin,
                                                          k_block_work_begin,
                                                          ho_block_work_begin,
                                                          wo_block_work_begin));
}
```
File: src/include/gridwise_direct_convolution_2.hip.hpp

@@ -139,7 +139,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_

```cpp
         c_block_data_begin += CPerBlock, __syncthreads())
    {
        // copy input tensor to LDS
        blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin,
                                                                      c_block_data_begin,
                                                                      hi_block_data_begin,
                                                                      wi_block_data_begin),
```

@@ -158,7 +159,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_

```cpp
#if 1
            threadwise_direct_convolution_2(in_thread_block_desc,
                                            p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
                                                                                  c_thread_data,
                                                                                  hi_thread_data_begin,
                                                                                  wi_thread_data_begin),
```

@@ -169,7 +171,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_

```cpp
#elif 0
            threadwise_direct_convolution_3(in_thread_block_desc,
                                            p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
                                                                                  c_thread_data,
                                                                                  hi_thread_data_begin,
                                                                                  wi_thread_data_begin),
```

@@ -186,7 +189,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_

```cpp
        out_thread_desc,
        p_out_thread,
        out_global_desc,
        p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
                                                  k_block_data_begin + k_thread_data_begin,
                                                  ho_block_data_begin + ho_thread_data_begin,
                                                  wo_block_data_begin + wo_thread_data_begin),
```
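A pattern worth calling out in these hunks: global offsets are formed by adding the block's begin and the thread's begin per coordinate, then linearizing once with `Get1dIndex`. Because the linearization is linear in each coordinate with no cross terms, composing coordinates first is equivalent to summing two separately computed offsets. The toy `Desc4d` below is purely illustrative; it is not the project's descriptor type.

```cpp
#include <cassert>
#include <cstddef>

// Hypothetical miniature of a 4-d tensor descriptor, illustrating the offset
// composition used throughout these hunks.
struct Desc4d
{
    std::size_t s0, s1, s2, s3; // strides per dimension

    constexpr std::size_t Get1dIndex(std::size_t i0, std::size_t i1,
                                     std::size_t i2, std::size_t i3) const
    {
        return i0 * s0 + i1 * s1 + i2 * s2 + i3 * s3;
    }
};

int main()
{
    constexpr Desc4d desc{64 * 8 * 8, 8 * 8, 8, 1}; // packed NCHW-like strides

    // Offsetting by (block begin + thread begin) in one call equals the sum
    // of the two offsets computed separately.
    assert(desc.Get1dIndex(1 + 2, 3, 4, 5) ==
           desc.Get1dIndex(1, 3, 4, 5) + desc.Get1dIndex(2, 0, 0, 0));
}
```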
File: src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp

@@ -184,7 +184,8 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric

```cpp
        threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread);

        const Float* p_in_global_block_begin =
            p_in_global + in_chwn_global_desc.Get1dIndex(
                              0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);

        const Float* p_wei_global_block_begin =
```

@@ -216,7 +217,7 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric

```cpp
        }
    }

    // output: register to global mem,
#if 0
    const auto c_thread_mtx_begin =
        blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
```

@@ -286,11 +287,12 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric

```cpp
    }
#endif

    threadwise_8d_tensor_copy(out_8d_thread_desc,
                              p_out_thread,
                              out_8d_global_desc,
                              p_out_global + out_khwn_global_desc.Get1dIndex(
                                                 k_block_data_begin + k_thread_data_begin,
                                                 ho_block_data_begin + ho_thread_data_begin,
                                                 wo_block_data_begin + wo_thread_data_begin,
                                                 n_block_data_begin + n_thread_data_begin),
```
File: src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp

@@ -283,7 +283,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(

```cpp
        out_hkwn_thread_desc,
        p_out_thread,
        out_khwn_global_desc,
        p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
                                                       ho_block_data_begin + ho_thread_data_begin,
                                                       wo_block_data_begin + wo_thread_data_begin,
                                                       n_block_data_begin + n_thread_data_begin),
```
File: src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp (+2, -2; diff collapsed in this view)
File: src/include/tensor.hpp

@@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)

```cpp
    return os;
}

typedef enum
{
    Half  = 0,
    Float = 1,
} DataType_t;
```