gaoqiong / composable_kernel · Commit f3099df8

Commit f3099df8, authored Sep 18, 2020 by Chao Liu
Parent: 0413a020

    trying to solve scratch mem issue

Showing 11 changed files with 128 additions and 87 deletions:
  composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp                                   +11  -9
  composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp   +1  -1
  composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp                           +33  -30
  composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp                            +4   -3
  composable_kernel/include/tensor_description/multi_index_transform.hpp                                   +2   -2
  composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp                                +1   -1
  driver/CMakeLists.txt                                                                                     +3   -0
  driver/include/device_dummy_dynamic_transform.hpp                                                        +27  -27
  driver/src/conv_driver.cpp                                                                               +12  -12
  driver/src/try.cpp                                                                                       +29  -0
  script/cmake-rocm3.7.sh                                                                                   +5   -2
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp

@@ -72,11 +72,12 @@ map_convolution_into_gemm(const WeiDesc& wei_k_c_y_x_global_desc,
         make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
         make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

-    const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
-        in_n_c_y_ho_x_wo_global_desc,
-        make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}),
-        make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
+    const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
+        in_n_c_y_ho_x_wo_global_desc,
+        make_tuple(DynamicMerge<3>{MultiIndex<3>{{C, Y, X}}},
+                   DynamicMerge<3>{MultiIndex<3>{{N, Ho, Wo}}}),
+        make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));

     return make_tuple(in_gemmk_gemmn_global_desc);
 }

@@ -146,7 +147,8 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
     const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor_v2(
         in_n_c_y_ho_x_wo_global_desc,
-        make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}),
+        make_tuple(DynamicMerge<3>{MultiIndex<3>{{C, Y, X}}},
+                   DynamicMerge<3>{MultiIndex<3>{{N, Ho, Wo}}}),
         make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
         make_tuple(Sequence<0>{}, Sequence<1>{}));

@@ -632,7 +634,7 @@ struct DummyDynamicTransform_1
         for(index_t iter = 0; iter < niter; ++iter)
         {
-            constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
+            constexpr auto gemmk1_gemmn0 = MultiIndex<2>{{1, 0}};

             in_gemmk_gemmn_coord += gemmk1_gemmn0;

@@ -793,8 +795,8 @@ struct DummyDynamicTransform_2
             in_n_c_hi_wi_global_desc,
             make_tuple(DynamicPassThrough{N},
                        DynamicPassThrough{C},
-                       DynamicLeftPad{Hi, InLeftPadH},
-                       DynamicLeftPad{Wi, InLeftPadW}),
+                       DynamicPassThrough{Hi},
+                       DynamicPassThrough{Wi}),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
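A note on the edit above, which recurs through the rest of the commit: the bare nested brace lists handed to the transforms, e.g. DynamicMerge<3>{{C, Y, X}}, are replaced by an explicitly typed inner index, DynamicMerge<3>{MultiIndex<3>{{C, Y, X}}}. The following is my own standalone sketch of that pattern, not code from this repository; MultiIndexSketch and DynamicMergeSketch are stand-ins for the ck types, and the point is only that naming the argument type makes the initialization independent of brace-elision rules.

// Standalone sketch with stand-in types (not ck code).
#include <cstdio>

using index_t = int;

template <int N>
struct MultiIndexSketch // stands in for ck::MultiIndex<N>
{
    index_t data[N];
};

template <int NDimLow>
struct DynamicMergeSketch // stands in for ck::DynamicMerge<NDimLow>
{
    MultiIndexSketch<NDimLow> low_lengths_;

    explicit constexpr DynamicMergeSketch(const MultiIndexSketch<NDimLow>& low_lengths)
        : low_lengths_{low_lengths}
    {
    }
};

int main()
{
    index_t C = 8, Y = 3, X = 3;

    // Old form, as removed above: the nested list is converted to the
    // constructor argument implicitly, leaning on brace-elision rules.
    DynamicMergeSketch<3> a{{C, Y, X}};

    // Form introduced by this commit: the inner index type is spelled out,
    // so there is exactly one way to read the initializer.
    DynamicMergeSketch<3> b{MultiIndexSketch<3>{{C, Y, X}}};

    std::printf("%d %d\n", a.low_lengths_.data[0], b.low_lengths_.data[0]);
    return 0;
}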
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -183,7 +183,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                                                AddressSpace::Vgpr,
                                                AddressSpace::Lds,
                                                InMemoryDataOperation::Set>(
-                {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+                MultiIndex<4>{{0, 0, b_block_data_on_global, 0}}, MultiIndex<4>{{0, 0, 0, 0}});

         // weight tensor
         //   global tensor in global memory, src of blockwise copy
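The hunk above swaps bare brace lists for explicitly typed MultiIndex<4> arguments in the blockwise-copy constructor call. I do not know the exact constraint inside the ck constructor that motivated it, but one common reason for preferring the typed form is that a bare braced-init-list has no type of its own: it cannot drive template argument deduction and it hides the intended rank at the call site. The sketch below illustrates only that general point with hypothetical stand-ins (make_blockwise_copy_sketch, MultiIndexSketch); it is not the repository's API.

// Standalone sketch with stand-in names (not ck code).
#include <array>
#include <cstdio>

using index_t = int;

template <int N>
using MultiIndexSketch = std::array<index_t, N>; // stands in for ck::MultiIndex<N>

// A function template deducing its index types cannot deduce anything from a
// bare braced-init-list, because the list has no type.
template <typename SrcIdx, typename DstIdx>
void make_blockwise_copy_sketch(const SrcIdx& src, const DstIdx& dst)
{
    std::printf("src[2] = %d, dst[0] = %d\n", src[2], dst[0]);
}

int main()
{
    index_t b_block_data_on_global = 42;

    // make_blockwise_copy_sketch({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
    // ^ would not compile in this sketch: the lists cannot deduce SrcIdx/DstIdx.

    // Spelling out the type, as the commit does, gives deduction something
    // concrete and documents the intended index rank.
    make_blockwise_copy_sketch(MultiIndexSketch<4>{{0, 0, b_block_data_on_global, 0}},
                               MultiIndexSketch<4>{{0, 0, 0, 0}});
    return 0;
}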
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp

@@ -10,20 +10,20 @@ struct DynamicPassThrough
     using LowerIndex = MultiIndex<1>;
     using UpperIndex = MultiIndex<1>;

-    const index_t up_length_;
+    const UpperIndex up_lengths_;

     __host__ __device__ explicit constexpr DynamicPassThrough(const index_t& low_length)
-        : up_length_{low_length}
+        : up_lengths_{{low_length}}
     {
     }

-    __host__ __device__ explicit constexpr DynamicPassThrough() : up_length_{0} {}
+    __host__ __device__ explicit constexpr DynamicPassThrough() : up_lengths_{{0}} {}

-    __host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }

-    __host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }

-    __host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }

     template <typename LowIdx, typename UpIdx>
     __host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up)

@@ -68,22 +68,22 @@ struct DynamicLeftPad
     using LowerIndex = MultiIndex<1>;
     using UpperIndex = MultiIndex<1>;

-    const index_t up_length_;
+    const UpperIndex up_lengths_;
     const index_t left_pad_;

     __host__ __device__ explicit constexpr DynamicLeftPad(const index_t& low_length,
                                                           const index_t& left_pad)
-        : up_length_{low_length + left_pad}, left_pad_{left_pad}
+        : up_lengths_{{low_length + left_pad}}, left_pad_{left_pad}
     {
     }

-    __host__ __device__ explicit constexpr DynamicLeftPad() : up_length_{0}, left_pad_{0} {}
+    __host__ __device__ explicit constexpr DynamicLeftPad() : up_lengths_{{0}}, left_pad_{0} {}

     __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }

     __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }

-    __host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }

     template <typename LowIdx, typename UpIdx>
     __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,

@@ -96,10 +96,11 @@ struct DynamicLeftPad
     }

     template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                            const UpIdxDiff& idx_diff_up,
-                                                            const LowIdx& /* idx_low_old */,
-                                                            const UpIdx& /* idx_up_old */)
+    __host__ __device__ static constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
+                                                                      const UpIdxDiff& idx_diff_up,
+                                                                      const LowIdx& /* idx_low_old */,
+                                                                      const UpIdx& /* idx_up_old */)
     {
         static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
                           UpIdx::Size() == 1,

@@ -129,18 +130,18 @@ struct DynamicRightPad
     using LowerIndex = MultiIndex<1>;
     using UpperIndex = MultiIndex<1>;

-    const index_t up_length_;
+    const UpperIndex up_lengths_;
     const index_t low_length_;
     const index_t right_pad_;

     __host__ __device__ explicit constexpr DynamicRightPad(const index_t& low_length,
                                                            const index_t& right_pad)
-        : up_length_{low_length + right_pad}, low_length_{low_length}, right_pad_{right_pad}
+        : up_lengths_{{low_length + right_pad}}, low_length_{low_length}, right_pad_{right_pad}
     {
     }

     __host__ __device__ explicit constexpr DynamicRightPad()
-        : up_length_{0}, low_length_{0}, right_pad_{0}
+        : up_lengths_{{0}}, low_length_{0}, right_pad_{0}
     {
     }

@@ -148,10 +149,11 @@ struct DynamicRightPad
     __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }

-    __host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }

     template <typename LowIdx, typename UpIdx>
-    __host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up)
+    __host__ __device__ static constexpr void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up)
     {
         static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
                       "wrong! inconsistent # of dimension");

@@ -160,10 +162,11 @@ struct DynamicRightPad
     }

     template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                            const UpIdxDiff& idx_diff_up,
-                                                            const LowIdx& /* idx_low_old */,
-                                                            const UpIdx& /* idx_up_old */)
+    __host__ __device__ static constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
+                                                                      const UpIdxDiff& idx_diff_up,
+                                                                      const LowIdx& /* idx_low_old */,
+                                                                      const UpIdx& /* idx_up_old */)
     {
         static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
                           UpIdx::Size() == 1,

@@ -216,7 +219,7 @@ struct DynamicEmbed
     __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }

-    __host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }

     template <typename LowIdx, typename UpIdx>
     __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,

@@ -276,13 +279,13 @@ struct DynamicMerge
     const LowerIndex low_lengths_;
     const LowerIndex low_lengths_scan_;
-    const index_t up_length_;
+    const UpperIndex up_lengths_;

     __host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths)
         : low_lengths_{low_lengths},
           low_lengths_scan_{
              reverse_exclusive_scan_on_array(low_lengths, math::multiplies<index_t>{}, index_t{1})},
-          up_length_{reduce_on_array(low_lengths, math::multiplies<index_t>(), 1)}
+          up_lengths_{{reduce_on_array(low_lengths, math::multiplies<index_t>(), index_t{1})}}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }

@@ -290,7 +293,7 @@ struct DynamicMerge
     __host__ __device__ explicit constexpr DynamicMerge()
         : low_lengths_{make_zero_array<index_t, NDimLow>()},
           low_lengths_scan_{make_zero_array<index_t, NDimLow>()},
-          up_length_{0}
+          up_lengths_{{0}}
     {
     }

@@ -298,7 +301,7 @@ struct DynamicMerge
     __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }

-    __host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }

     template <typename LowIdx, typename UpIdx>
     __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,

@@ -444,7 +447,7 @@ struct DynamicUnMerge
     __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }

-    __host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }

     template <typename LowIdx, typename UpIdx>
     __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,

@@ -500,7 +503,7 @@ struct DynamicFreeze
     __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 0; }

-    __host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{}; }
+    __host__ __device__ static constexpr auto GetUpperLengths() { return UpperIndex{}; }

     template <typename LowIdx, typename UpIdx>
     __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
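The pattern repeated across DynamicPassThrough, DynamicLeftPad, DynamicRightPad and DynamicMerge above is: store the upper lengths as an index array member (up_lengths_) rather than a scalar, and let GetUpperLengths() return a const reference instead of materializing UpperIndex{up_length_} on every call. That reads like the "scratch mem" angle of the commit message, since a fresh array temporary per call is exactly the kind of object a GPU compiler may spill. The following is a minimal standalone sketch of the before/after shape, using stand-in names (PassThroughBefore, PassThroughAfter, UpperIndexSketch); it is an illustration, not code from the repository.

// Standalone sketch with stand-in types (not ck code).
#include <array>
#include <cstdio>

using index_t = int;
using UpperIndexSketch = std::array<index_t, 1>; // stands in for ck::MultiIndex<1>

struct PassThroughBefore
{
    const index_t up_length_;

    // Builds a fresh UpperIndexSketch temporary on every call; in device code
    // such temporaries are candidates for scratch/spill storage.
    constexpr UpperIndexSketch GetUpperLengths() const { return UpperIndexSketch{{up_length_}}; }
};

struct PassThroughAfter
{
    const UpperIndexSketch up_lengths_;

    // Hands back a reference to the member that already exists; nothing new
    // is materialized at the call site.
    constexpr const UpperIndexSketch& GetUpperLengths() const { return up_lengths_; }
};

int main()
{
    PassThroughBefore a{8};
    PassThroughAfter b{{{8}}};

    std::printf("%d %d\n", a.GetUpperLengths()[0], b.GetUpperLengths()[0]);
    return 0;
}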
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp

@@ -150,7 +150,7 @@ struct DynamicTensorDescriptor_v2
                                                                index_t element_space_size)
     {
         // zero initialization
-        HiddenIndex hidden_lengths{0};
+        HiddenIndex hidden_lengths{{0}};

         // this is the orignal tensor element space size
         hidden_lengths(0) = element_space_size;

@@ -319,7 +319,8 @@ struct lambda_get_up_dim_num
     template <typename I>
     __host__ __device__ constexpr auto operator()(I) const
     {
-        return Number<NewTransforms{}.At(I{}).GetNumOfUpperDimension()>{};
+        using Tran = remove_reference_t<decltype(NewTransforms{}.At(I{}))>;
+        return Number<Tran::GetNumOfUpperDimension()>{};
     }
 };

@@ -488,7 +489,7 @@ __host__ __device__ void move_dynamic_tensor_coordinate_v2(const TensorDesc& ten
     using HiddenIndex = MultiIndex<ndim_hidden>;

     // this is what needs to be calculated
-    auto idx_diff_hidden = HiddenIndex{0};
+    auto idx_diff_hidden = HiddenIndex{{0}};

     // initialize visible index diff
     // idx_diff_hidden_pick_visible contains reference to idx_diff_hidden
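In lambda_get_up_dim_num above, the dimension count is now read off the transform's type, through remove_reference_t<decltype(...)> and the now-static GetNumOfUpperDimension, rather than being evaluated on freshly default-constructed NewTransforms and transform objects. Below is a small standalone sketch of the same idea using std::tuple/std::get and stand-in transform types (MergeSketch, EmbedSketch); it mirrors the pattern rather than the actual ck helpers.

// Standalone sketch with stand-in types (not ck code).
#include <cstddef>
#include <cstdio>
#include <tuple>
#include <type_traits>

struct MergeSketch // stands in for a ck dynamic transform
{
    // After this commit the dimension-count queries are static, so no object
    // is needed to evaluate them at compile time.
    static constexpr int GetNumOfUpperDimension() { return 1; }
};

struct EmbedSketch
{
    static constexpr int GetNumOfUpperDimension() { return 2; }
};

template <typename Transforms, std::size_t I>
constexpr int up_dim_num_sketch()
{
    // decltype names the element type without materializing a Transforms
    // value at run time; remove_reference_t strips the reference added by
    // std::get, mirroring the remove_reference_t<decltype(...)> above.
    using Tran = std::remove_reference_t<decltype(std::get<I>(std::declval<Transforms>()))>;
    return Tran::GetNumOfUpperDimension();
}

int main()
{
    using Transforms = std::tuple<MergeSketch, EmbedSketch>;
    static_assert(up_dim_num_sketch<Transforms, 0>() == 1, "");
    static_assert(up_dim_num_sketch<Transforms, 1>() == 2, "");
    std::printf("%d %d\n", up_dim_num_sketch<Transforms, 0>(), up_dim_num_sketch<Transforms, 1>());
    return 0;
}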
composable_kernel/include/tensor_description/multi_index_transform.hpp

@@ -364,7 +364,7 @@ struct UnMerge
     __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
     {
-        LowerIndex idx_low{0};
+        LowerIndex idx_low{{0}};

         constexpr auto pseudo_up_strides = reverse_inclusive_scan_sequence(

@@ -425,7 +425,7 @@ struct Embed
     __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
     {
-        LowerIndex idx_low = {Coefficients{}[nDimUp]};
+        LowerIndex idx_low{{Coefficients{}[nDimUp]}};

         for(index_t i = 0; i < nDimUp; ++i)
         {
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp

@@ -196,7 +196,7 @@ struct ClusterDescriptor
     __host__ __device__ static constexpr auto CalculateClusterIndex(index_t idx_1d)
     {
-        return mDesc.CalculateLowerIndex(MultiIndex<1>{idx_1d});
+        return mDesc.CalculateLowerIndex(MultiIndex<1>{{idx_1d}});
     }
 };
driver/CMakeLists.txt

@@ -17,6 +17,7 @@ install(TARGETS host LIBRARY DESTINATION lib)
 if(DEVICE_BACKEND STREQUAL "AMD")
     set(CONV_SOURCE src/conv_driver.cpp)
     set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp)
+    set(TRY_SOURCE src/try.cpp)
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
     set(CONV_SOURCE src/conv_driver.cu)
     set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu)

@@ -24,6 +25,8 @@ endif()
 add_executable(conv_driver ${CONV_SOURCE})
 add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE})
+add_executable(try ${TRY_SOURCE})

 target_link_libraries(conv_driver PRIVATE host)
 target_link_libraries(conv_bwd_data_driver PRIVATE host)
+target_link_libraries(try PRIVATE host)
driver/include/device_dummy_dynamic_transform.hpp

@@ -12,17 +12,17 @@ template <class T,
           class ConvDilations,
           class InLeftPads,
           class InRightPads>
-void device_dummy_dynamic_transform(InDesc,
-                                    const Tensor<T>& in_nchw,
-                                    WeiDesc,
-                                    const Tensor<T>& wei_kcyx,
-                                    OutDesc,
-                                    Tensor<T>& out_nkhw,
-                                    ConvStrides,
-                                    ConvDilations,
-                                    InLeftPads,
-                                    InRightPads,
-                                    ck::index_t nrepeat)
+void device_dummy_dynamic_transform_1(InDesc,
+                                      const Tensor<T>& in_nchw,
+                                      WeiDesc,
+                                      const Tensor<T>& wei_kcyx,
+                                      OutDesc,
+                                      Tensor<T>& out_nkhw,
+                                      ConvStrides,
+                                      ConvDilations,
+                                      InLeftPads,
+                                      InRightPads,
+                                      ck::index_t nrepeat)
 {
     using namespace ck;

@@ -52,11 +52,11 @@ void device_dummy_dynamic_transform(InDesc,
     const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});

     auto in_gemmk_gemmn_coord =
-        make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0});
+        make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, MultiIndex<2>{{0, 0}});

     for(index_t iter = 0; iter < 10; ++iter)
     {
-        constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
+        constexpr auto gemmk1_gemmn0 = MultiIndex<2>{{1, 0}};

         printf("iter %d\n", iter);

@@ -147,17 +147,17 @@ template <class T,
           class ConvDilations,
           class InLeftPads,
           class InRightPads>
-void device_dummy_dynamic_transform_v2(InDesc,
-                                       const Tensor<T>& in_nchw,
-                                       WeiDesc,
-                                       const Tensor<T>& wei_kcyx,
-                                       OutDesc,
-                                       Tensor<T>& out_nkhw,
-                                       ConvStrides,
-                                       ConvDilations,
-                                       InLeftPads,
-                                       InRightPads,
-                                       ck::index_t nrepeat)
+void device_dummy_dynamic_transform_2(InDesc,
+                                      const Tensor<T>& in_nchw,
+                                      WeiDesc,
+                                      const Tensor<T>& wei_kcyx,
+                                      OutDesc,
+                                      Tensor<T>& out_nkhw,
+                                      ConvStrides,
+                                      ConvDilations,
+                                      InLeftPads,
+                                      InRightPads,
+                                      ck::index_t nrepeat)
 {
     using namespace ck;

@@ -187,10 +187,10 @@ void device_dummy_dynamic_transform_v2(InDesc,
     const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});

     auto in_gemmk_gemmn_coord =
-        make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0});
+        make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{{0, 0}});

-    const auto in_gemmk_gemmn_coord_step =
-        make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 1});
+    const auto in_gemmk_gemmn_coord_step =
+        make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{{0, 1}});

     for(index_t iter = 0; iter < 20; ++iter)
     {
driver/src/conv_driver.cpp

@@ -585,18 +585,18 @@ int main(int argc, char* argv[])
                                        RightPads{},
                                        nrepeat);
 #elif 0
-    device_dummy_dynamic_transform(in_nchw_desc,
-                                   in_nchw,
-                                   wei_kcyx_desc,
-                                   wei_kcyx,
-                                   out_nkhw_desc,
-                                   out_nkhw_device,
-                                   ConvStrides{},
-                                   ConvDilations{},
-                                   LeftPads{},
-                                   RightPads{},
-                                   nrepeat);
+    device_dummy_dynamic_transform_1(in_nchw_desc,
+                                     in_nchw,
+                                     wei_kcyx_desc,
+                                     wei_kcyx,
+                                     out_nkhw_desc,
+                                     out_nkhw_device,
+                                     ConvStrides{},
+                                     ConvDilations{},
+                                     LeftPads{},
+                                     RightPads{},
+                                     nrepeat);
 #elif 1
+    device_dummy_dynamic_transform_2(in_nchw_desc,
+                                     in_nchw,
+                                     wei_kcyx_desc,
+                                     wei_kcyx,
driver/src/try.cpp (new file, mode 100644)

+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor_generator.hpp"
+#include "conv_common.hpp"
+#include "host_conv.hpp"
+#include "device_tensor.hpp"
+#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+
+int main(int argc, char* argv[])
+{
+    using namespace ck;
+
+    auto idx1 = std::array<index_t, 2>{{1, 0}};
+    auto idx2 = Array<index_t, 2>{{1, 0}};
+    auto idx3 = MultiIndex<2>{{1, 0}};
+    auto idx0 = MultiIndex<2>{{1, 0}};
+
+    print_array("idx2", idx2);
+    print_array("idx3", idx2);
+}
script/cmake-rocm3.5.sh → script/cmake-rocm3.7.sh (renamed)

@@ -8,14 +8,17 @@ MY_PROJECT_INSTALL=../install.dir
 cmake \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
--D CMAKE_BUILD_TYPE=Release \
+-D CMAKE_BUILD_TYPE=Debug \
 -D DEVICE_BACKEND="AMD" \
--D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
+-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
 -D CMAKE_PREFIX_PATH="/opt/rocm" \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 ${MY_PROJECT_SOURCE}
 #-D CMAKE_CXX_FLAGS="-c -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
 #-D CMAKE_CXX_FLAGS="-gline-tables-only -S -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
 #-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
 #-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
 #-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
 #-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \