Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
5f0c56d0
"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "b94fd0b2279c6476c6e109e99dc5d0e6d8ce313c"
Commit
5f0c56d0
authored
Feb 23, 2021
by
Chao Liu
Browse files
refactor vector_type
parent
20fa988f
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
54 additions
and
244 deletions
+54
-244
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
...or_operation/threadwise_dynamic_tensor_slice_transfer.hpp
+3
-3
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
...sable_kernel/include/utility/amd_buffer_addressing_v2.hpp
+8
-8
composable_kernel/include/utility/float_type.amd.hpp.in
composable_kernel/include/utility/float_type.amd.hpp.in
+38
-199
driver/include/device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+2
-0
driver/include/device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+2
-0
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
...convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+0
-33
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+1
-1
No files found.
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
View file @
5f0c56d0
...
@@ -186,7 +186,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
...
@@ -186,7 +186,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
src_desc
.
CalculateOffset
(
to_multi_index
(
src_slice_origin_idx
)
+
dst_data_idx
+
src_desc
.
CalculateOffset
(
to_multi_index
(
src_slice_origin_idx
)
+
dst_data_idx
+
i
*
dst_scalar_step_in_vector
);
i
*
dst_scalar_step_in_vector
);
dst_vector
(
i
)
=
p_src
[
Number
<
src_offset
>
{}];
dst_vector
.
Scalars
()
(
i
)
=
p_src
[
Number
<
src_offset
>
{}];
});
});
amd_buffer_store_v2
<
DstData
,
DstScalarPerVector
>
(
amd_buffer_store_v2
<
DstData
,
DstScalarPerVector
>
(
...
@@ -837,7 +837,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
...
@@ -837,7 +837,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
constexpr
index_t
buffer_offset
=
constexpr
index_t
buffer_offset
=
buffer_desc_
.
CalculateOffset
(
src_data_idx
+
i
*
src_scalar_step_in_vector
);
buffer_desc_
.
CalculateOffset
(
src_data_idx
+
i
*
src_scalar_step_in_vector
);
buffer_
(
Number
<
buffer_offset
>
{})
=
src_vector
[
i
];
buffer_
(
Number
<
buffer_offset
>
{})
=
src_vector
.
Scalars
()
[
i
];
});
});
constexpr
auto
move_on_dim
=
[
&
]()
constexpr
constexpr
auto
move_on_dim
=
[
&
]()
constexpr
...
@@ -995,7 +995,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
...
@@ -995,7 +995,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
constexpr
index_t
buffer_offset
=
constexpr
index_t
buffer_offset
=
buffer_desc_
.
CalculateOffset
(
dst_data_idx
+
i
*
dst_scalar_step_in_vector
);
buffer_desc_
.
CalculateOffset
(
dst_data_idx
+
i
*
dst_scalar_step_in_vector
);
dst_vector
(
i
)
=
buffer_
[
Number
<
buffer_offset
>
{}];
dst_vector
.
Scalars
()
(
i
)
=
buffer_
[
Number
<
buffer_offset
>
{}];
});
});
using
DstVectorType
=
typename
vector_type
<
DstData
,
DstScalarPerVector
>::
MemoryType
;
using
DstVectorType
=
typename
vector_type
<
DstData
,
DstScalarPerVector
>::
MemoryType
;
...
...
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
View file @
5f0c56d0
...
@@ -188,28 +188,28 @@ __device__ float8_t amd_buffer_load_v2<float, 8>(const float* p_src_wave,
...
@@ -188,28 +188,28 @@ __device__ float8_t amd_buffer_load_v2<float, 8>(const float* p_src_wave,
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
vector_type
<
float
,
8
>
vector
;
vector_type
<
float
,
8
>
tmp
;
v
ector
.
Set
(
Number
<
4
>
{}
,
Number
<
0
>
{})
=
__llvm_amdgcn_raw_buffer_load_fp32x4
(
tmp
.
V
ector
s
(
Number
<
4
>
{}
)(
Number
<
0
>
{})
=
__llvm_amdgcn_raw_buffer_load_fp32x4
(
src_wave_buffer_resource
.
data
,
src_addr_shift
+
src_thread_addr_offset
,
0
,
0
);
src_wave_buffer_resource
.
data
,
src_addr_shift
+
src_thread_addr_offset
,
0
,
0
);
v
ector
.
Set
(
Number
<
4
>
{}
,
Number
<
1
>
{})
=
__llvm_amdgcn_raw_buffer_load_fp32x4
(
tmp
.
V
ector
s
(
Number
<
4
>
{}
)(
Number
<
1
>
{})
=
__llvm_amdgcn_raw_buffer_load_fp32x4
(
src_wave_buffer_resource
.
data
,
src_wave_buffer_resource
.
data
,
src_addr_shift
+
src_thread_addr_offset
+
4
*
sizeof
(
float
),
src_addr_shift
+
src_thread_addr_offset
+
4
*
sizeof
(
float
),
0
,
0
,
0
);
0
);
return
vector
.
Get
(
Number
<
8
>
{},
Number
<
0
>
{}
);
return
tmp
.
Vector
(
);
#else
#else
vector_type
<
float
,
8
>
vector
;
vector_type
<
float
,
8
>
tmp
;
v
ector
.
Set
(
Number
<
4
>
{}
,
Number
<
0
>
{})
=
__llvm_amdgcn_raw_buffer_load_fp32x4
(
tmp
.
V
ector
s
(
Number
<
4
>
{}
)(
Number
<
0
>
{})
=
__llvm_amdgcn_raw_buffer_load_fp32x4
(
src_wave_buffer_resource
.
data
,
src_thread_addr_offset
,
0
,
0
);
src_wave_buffer_resource
.
data
,
src_thread_addr_offset
,
0
,
0
);
v
ector
.
Set
(
Number
<
4
>
{}
,
Number
<
1
>
{})
=
__llvm_amdgcn_raw_buffer_load_fp32x4
(
tmp
.
V
ector
s
(
Number
<
4
>
{}
)(
Number
<
1
>
{})
=
__llvm_amdgcn_raw_buffer_load_fp32x4
(
src_wave_buffer_resource
.
data
,
src_thread_addr_offset
+
4
*
sizeof
(
float
),
0
,
0
);
src_wave_buffer_resource
.
data
,
src_thread_addr_offset
+
4
*
sizeof
(
float
),
0
,
0
);
return
src_thread_data_valid
?
vector
.
Get
(
Number
<
8
>
{},
Number
<
0
>
{}
)
:
float8_t
(
0
);
return
src_thread_data_valid
?
tmp
.
Vector
(
)
:
float8_t
(
0
);
#endif
#endif
}
}
...
...
composable_kernel/include/utility/float_type.amd.hpp.in
View file @
5f0c56d0
...
@@ -177,47 +177,27 @@ struct vector_type<T, 1>
...
@@ -177,47 +177,27 @@ struct vector_type<T, 1>
{
{
using MemoryType = T;
using MemoryType = T;
T data_;
union
{
T d1_;
StaticallyIndexedArray<T, 1> d1x1_;
} data_;
__host__ __device__ constexpr vector_type() : data_{T{0}} {}
__host__ __device__ constexpr vector_type() : data_{T{0}} {}
__host__ __device__ static constexpr index_t Size() { return 1; }
__host__ __device__ static constexpr index_t Size() { return 1; }
__host__ __device__ constexpr const auto& Vector() const { return data_; }
__host__ __device__ constexpr const auto& Vector() const { return data_.d1_; }
__host__ __device__ constexpr auto& Vector() { return data_; }
template <index_t I>
__host__ __device__ constexpr const auto& Get(Number<1>, Number<I>) const
{
static_assert(I == 0, "wrong!");
return data_;
__host__ __device__ constexpr auto& Vector() { return data_.d1_; }
}
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<1>, Number<I>)
{
static_assert(I == 0, "wrong!");
return data_;
}
template <index_t I>
__host__ __device__ constexpr const auto& Scalars() const { return data_.d1x1_; }
__host__ __device__ constexpr const auto& operator[](Number<I>) const
{
static_assert(I == 0, "wrong!");
return data_;
__host__ __device__ constexpr auto& Scalars() { return data_.d1x1_; }
}
template <index_t I>
__host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x1_; }
__host__ __device__ constexpr auto& operator()(Number<I>)
{
static_assert(I == 0, "wrong!");
return data_;
__host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x1_; }
}
};
};
template <typename T>
template <typename T>
...
@@ -232,6 +212,7 @@ struct vector_type<T, 2>
...
@@ -232,6 +212,7 @@ struct vector_type<T, 2>
{
{
d2_t d2_;
d2_t d2_;
StaticallyIndexedArray<d1_t, 2> d1x2_;
StaticallyIndexedArray<d1_t, 2> d1x2_;
StaticallyIndexedArray<d2_t, 1> d2x1_;
} data_;
} data_;
__host__ __device__ constexpr vector_type() : data_{d2_t{0}} {}
__host__ __device__ constexpr vector_type() : data_{d2_t{0}} {}
...
@@ -242,53 +223,17 @@ struct vector_type<T, 2>
...
@@ -242,53 +223,17 @@ struct vector_type<T, 2>
__host__ __device__ constexpr auto& Vector() { return data_.d2_; }
__host__ __device__ constexpr auto& Vector() { return data_.d2_; }
template <index_t I>
__host__ __device__ constexpr const auto& Scalars() const { return data_.d1x2_; }
__host__ __device__ constexpr const auto& Get(Number<1>, Number<I> i) const
{
static_assert(I >= 0 && I < 2, "wrong!");
return data_.d1x2_[i];
}
template <index_t I>
__host__ __device__ constexpr const auto& Get(Number<2>, Number<I>) const
{
static_assert(I == 0, "wrong!");
return data_.d2_;
}
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<1>, Number<I> i)
{
static_assert(I >= 0 && I < 2, "wrong!");
return data_.d1x2_(i);
__host__ __device__ constexpr auto& Scalars() { return data_.d1x2_; }
}
template <index_t I>
__host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x2_; }
__host__ __device__ constexpr auto& Set(Number<2>, Number<I>)
{
static_assert(I == 0, "wrong!");
return data_.d2_;
__host__ __device__ constexpr const auto& Vectors(Number<2>) const { return data_.d2x1_; }
}
template <index_t I>
__host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x2_; }
__host__ __device__ constexpr const auto& operator[](Number<I>) const
{
static_assert(I >= 0 && I < 2, "wrong!");
return data_.d1x2_[Number<I>{}];
__host__ __device__ constexpr auto& Vectors(Number<2>) { return data_.d2x1_; }
}
template <index_t I>
__host__ __device__ constexpr auto& operator()(Number<I>)
{
static_assert(I >= 0 && I < 2, "wrong!");
return data_.d1x2_(Number<I>{});
}
};
};
template <typename T>
template <typename T>
...
@@ -305,6 +250,7 @@ struct vector_type<T, 4>
...
@@ -305,6 +250,7 @@ struct vector_type<T, 4>
d4_t d4_;
d4_t d4_;
StaticallyIndexedArray<d1_t, 4> d1x4_;
StaticallyIndexedArray<d1_t, 4> d1x4_;
StaticallyIndexedArray<d2_t, 2> d2x2_;
StaticallyIndexedArray<d2_t, 2> d2x2_;
StaticallyIndexedArray<d4_t, 1> d4x1_;
} data_;
} data_;
__host__ __device__ constexpr vector_type() : data_{d4_t{0}} {}
__host__ __device__ constexpr vector_type() : data_{d4_t{0}} {}
...
@@ -315,69 +261,21 @@ struct vector_type<T, 4>
...
@@ -315,69 +261,21 @@ struct vector_type<T, 4>
__host__ __device__ constexpr auto& Vector() { return data_.d4_; }
__host__ __device__ constexpr auto& Vector() { return data_.d4_; }
template <index_t I>
__host__ __device__ constexpr const auto& Scalars() const { return data_.d1x4_; }
__host__ __device__ constexpr const auto& Get(Number<1>, Number<I> i) const
{
static_assert(I >= 0 && I < 4, "wrong!");
return data_.d1x4_[i];
__host__ __device__ constexpr auto& Scalars() { return data_.d1x4_; }
}
template <index_t I>
__host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x4_; }
__host__ __device__ constexpr const auto& Get(Number<2>, Number<I> i) const
{
static_assert(I >= 0 && I < 2, "wrong!");
return data_.d2x2_[i];
__host__ __device__ constexpr const auto& Vectors(Number<2>) const { return data_.d2x2_; }
}
template <index_t I>
__host__ __device__ constexpr const auto& Vectors(Number<4>) const { return data_.d4x1_; }
__host__ __device__ constexpr const auto& Get(Number<4>, Number<I>) const
{
static_assert(I == 0, "wrong!");
return data_.d4_;
__host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x4_; }
}
template <index_t I>
__host__ __device__ constexpr auto& Vectors(Number<2>) { return data_.d2x2_; }
__host__ __device__ constexpr auto& Set(Number<1>, Number<I> i)
{
static_assert(I >= 0 && I < 4, "wrong!");
return data_.d1x4_(i);
}
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<2>, Number<I> i)
{
static_assert(I >= 0 && I < 3, "wrong!");
return data_.d2x2_(i);
__host__ __device__ constexpr auto& Vectors(Number<4>) { return data_.d4x1_; }
}
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<4>, Number<I>)
{
static_assert(I == 0, "wrong!");
return data_.d4_;
}
template <index_t I>
__host__ __device__ constexpr const auto& operator[](Number<I>) const
{
static_assert(I >= 0 && I < 4, "wrong!");
return data_.d1x4_[Number<I>{}];
}
template <index_t I>
__host__ __device__ constexpr auto& operator()(Number<I>)
{
static_assert(I >= 0 && I < 4, "wrong!");
return data_.d1x4_(Number<I>{});
}
};
};
template <typename T>
template <typename T>
...
@@ -396,6 +294,7 @@ struct vector_type<T, 8>
...
@@ -396,6 +294,7 @@ struct vector_type<T, 8>
StaticallyIndexedArray<d1_t, 8> d1x8_;
StaticallyIndexedArray<d1_t, 8> d1x8_;
StaticallyIndexedArray<d2_t, 4> d2x4_;
StaticallyIndexedArray<d2_t, 4> d2x4_;
StaticallyIndexedArray<d4_t, 2> d4x2_;
StaticallyIndexedArray<d4_t, 2> d4x2_;
StaticallyIndexedArray<d8_t, 1> d8x1_;
} data_;
} data_;
__host__ __device__ constexpr vector_type() : data_{d8_t{0}} {}
__host__ __device__ constexpr vector_type() : data_{d8_t{0}} {}
...
@@ -406,85 +305,25 @@ struct vector_type<T, 8>
...
@@ -406,85 +305,25 @@ struct vector_type<T, 8>
__host__ __device__ constexpr auto& Vector() { return data_.d8_; }
__host__ __device__ constexpr auto& Vector() { return data_.d8_; }
template <index_t I>
__host__ __device__ constexpr const auto& Scalars() const { return data_.d1x8_; }
__host__ __device__ constexpr const auto& Get(Number<1>, Number<I> i) const
{
static_assert(I >= 0 && I < 8, "wrong!");
return data_.d1x8_[i];
__host__ __device__ constexpr auto& Scalars() { return data_.d1x8_; }
}
template <index_t I>
__host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x8_; }
__host__ __device__ constexpr const auto& Get(Number<2>, Number<I> i) const
{
static_assert(I >= 0 && I < 4, "wrong!");
return data_.d2x4_[i];
__host__ __device__ constexpr const auto& Vectors(Number<2>) const { return data_.d2x4_; }
}
template <index_t I>
__host__ __device__ constexpr const auto& Vectors(Number<4>) const { return data_.d4x2_; }
__host__ __device__ constexpr const auto& Get(Number<4>, Number<I> i) const
{
static_assert(I >= 0 && I < 2, "wrong!");
return data_.d4x2_[i];
__host__ __device__ constexpr const auto& Vectors(Number<8>) const { return data_.d8x1_; }
}
template <index_t I>
__host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x8_; }
__host__ __device__ constexpr const auto& Get(Number<8>, Number<I>) const
{
static_assert(I == 0, "wrong!");
return data_.d8_;
__host__ __device__ constexpr auto& Vectors(Number<2>) { return data_.d2x4_; }
}
template <index_t I>
__host__ __device__ constexpr auto& Vectors(Number<4>) { return data_.d4x2_; }
__host__ __device__ constexpr auto& Set(Number<1>, Number<I> i)
{
static_assert(I >= 0 && I < 8, "wrong!");
return data_.d1x8_(i);
__host__ __device__ constexpr auto& Vectors(Number<8>) { return data_.d8x1_; }
}
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<2>, Number<I> i)
{
static_assert(I >= 0 && I < 4, "wrong!");
return data_.d2x4_(i);
}
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<4>, Number<I> i)
{
static_assert(I >= 0 && I < 2, "wrong!");
return data_.d4x2_(i);
}
template <index_t I>
__host__ __device__ constexpr auto& Set(Number<8>, Number<I> i)
{
static_assert(I == 0, "wrong!");
return data_.d8_;
}
template <index_t I>
__host__ __device__ constexpr const auto& operator[](Number<I>) const
{
static_assert(I >= 0 && I < 8, "wrong!");
return data_.d1x8_[Number<I>{}];
}
template <index_t I>
__host__ __device__ constexpr auto& operator()(Number<I>)
{
static_assert(I >= 0 && I < 8, "wrong!");
return data_.d1x8_(Number<I>{});
}
};
};
// data type conversion
// data type conversion
...
...
driver/include/device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
View file @
5f0c56d0
...
@@ -25,6 +25,8 @@ void device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
...
@@ -25,6 +25,8 @@ void device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
RightPads
,
RightPads
,
ck
::
index_t
nrepeat
)
ck
::
index_t
nrepeat
)
{
{
std
::
cout
<<
"device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw"
<<
std
::
endl
;
using
namespace
ck
;
using
namespace
ck
;
using
TDevice
=
typename
conditional
<
is_same
<
half_float
::
half
,
T
>::
value
,
half_t
,
T
>::
type
;
using
TDevice
=
typename
conditional
<
is_same
<
half_float
::
half
,
T
>::
value
,
half_t
,
T
>::
type
;
...
...
driver/include/device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @
5f0c56d0
...
@@ -24,6 +24,8 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
...
@@ -24,6 +24,8 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
InRightPads
,
InRightPads
,
ck
::
index_t
nrepeat
)
ck
::
index_t
nrepeat
)
{
{
std
::
cout
<<
"device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw"
<<
std
::
endl
;
using
namespace
ck
;
using
namespace
ck
;
using
TDevice
=
typename
conditional
<
is_same
<
half_float
::
half
,
T
>::
value
,
half_t
,
T
>::
type
;
using
TDevice
=
typename
conditional
<
is_same
<
half_float
::
half
,
T
>::
value
,
half_t
,
T
>::
type
;
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
View file @
5f0c56d0
...
@@ -110,39 +110,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(InDesc
...
@@ -110,39 +110,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(InDesc
out_nhwk_device_buf
.
ToDevice
(
out_nhwk
.
mData
.
data
());
out_nhwk_device_buf
.
ToDevice
(
out_nhwk
.
mData
.
data
());
#if 0
#if 0
// cdata = 64, BlockSize = 128, 32x256x8
constexpr index_t BlockSize = 128;
constexpr index_t GemmMPerBlock = 32;
constexpr index_t GemmNPerBlock = 256;
constexpr index_t GemmKPerBlock = 8;
constexpr index_t GemmMPerThread = 4;
constexpr index_t GemmNPerThread = 4;
constexpr index_t GemmKPerThread = 1;
constexpr index_t GemmMLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 16;
constexpr index_t ThreadGemmDataPerReadM = 4;
constexpr index_t ThreadGemmDataPerReadN = 4;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<2, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 32>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<8, 2>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 128>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif
0
// cdata = 64, BlockSize = 256, 128x128x8
// cdata = 64, BlockSize = 256, 128x128x8
constexpr index_t BlockSize = 256;
constexpr index_t BlockSize = 256;
...
...
driver/src/conv_driver.cpp
View file @
5f0c56d0
...
@@ -616,7 +616,7 @@ int main(int argc, char* argv[])
...
@@ -616,7 +616,7 @@ int main(int argc, char* argv[])
LeftPads
{},
LeftPads
{},
RightPads
{},
RightPads
{},
nrepeat
);
nrepeat
);
#elif
0
#elif
1
device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw
(
in_nchw_desc
,
device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw
(
in_nchw_desc
,
in_nchw
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx_desc
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment