Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
f3099df8
"vscode:/vscode.git/clone" did not exist on "72b712a1eef4f3ba292b8712e2acf15519d61378"
Commit
f3099df8
authored
Sep 18, 2020
by
Chao Liu
Browse files
trying to solve scratch mem issue
parent
0413a020
Changes
11
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
128 additions
and
87 deletions
+128
-87
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
...rnel/include/kernel_algorithm/dummy_dynamic_transform.hpp
+11
-9
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
...n_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+1
-1
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
...lude/tensor_description/dynamic_multi_index_transform.hpp
+33
-30
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
...clude/tensor_description/dynamic_tensor_descriptor_v2.hpp
+4
-3
composable_kernel/include/tensor_description/multi_index_transform.hpp
...rnel/include/tensor_description/multi_index_transform.hpp
+2
-2
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
...l/include/tensor_description/tensor_descriptor_helper.hpp
+1
-1
driver/CMakeLists.txt
driver/CMakeLists.txt
+3
-0
driver/include/device_dummy_dynamic_transform.hpp
driver/include/device_dummy_dynamic_transform.hpp
+27
-27
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+12
-12
driver/src/try.cpp
driver/src/try.cpp
+29
-0
script/cmake-rocm3.7.sh
script/cmake-rocm3.7.sh
+5
-2
No files found.
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
View file @
f3099df8
...
...
@@ -72,9 +72,10 @@ map_convolution_into_gemm(const WeiDesc& wei_k_c_y_x_global_desc,
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
const
auto
in_gemmk_gemmn_global_desc
=
transform_dynamic_tensor_descriptor
(
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
DynamicMerge
<
3
>
{{
C
,
Y
,
X
}},
DynamicMerge
<
3
>
{{
N
,
Ho
,
Wo
}}),
const
auto
in_gemmk_gemmn_global_desc
=
transform_dynamic_tensor_descriptor
(
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
DynamicMerge
<
3
>
{
MultiIndex
<
3
>
{{
C
,
Y
,
X
}}},
DynamicMerge
<
3
>
{
MultiIndex
<
3
>
{{
N
,
Ho
,
Wo
}}}),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
...
...
@@ -146,7 +147,8 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
const
auto
in_gemmk_gemmn_global_desc
=
transform_dynamic_tensor_descriptor_v2
(
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
DynamicMerge
<
3
>
{{
C
,
Y
,
X
}},
DynamicMerge
<
3
>
{{
N
,
Ho
,
Wo
}}),
make_tuple
(
DynamicMerge
<
3
>
{
MultiIndex
<
3
>
{{
C
,
Y
,
X
}}},
DynamicMerge
<
3
>
{
MultiIndex
<
3
>
{{
N
,
Ho
,
Wo
}}}),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
...
...
@@ -632,7 +634,7 @@ struct DummyDynamicTransform_1
for
(
index_t
iter
=
0
;
iter
<
niter
;
++
iter
)
{
constexpr
auto
gemmk1_gemmn0
=
MultiIndex
<
2
>
{
1
,
0
};
constexpr
auto
gemmk1_gemmn0
=
MultiIndex
<
2
>
{
{
1
,
0
}
}
;
in_gemmk_gemmn_coord
+=
gemmk1_gemmn0
;
...
...
@@ -793,8 +795,8 @@ struct DummyDynamicTransform_2
in_n_c_hi_wi_global_desc
,
make_tuple
(
DynamicPassThrough
{
N
},
DynamicPassThrough
{
C
},
Dynamic
LeftPad
{
Hi
,
InLeftPadH
},
Dynamic
LeftPad
{
Wi
,
InLeftPadW
}),
Dynamic
PassThrough
{
Hi
},
Dynamic
PassThrough
{
Wi
}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
...
...
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
View file @
f3099df8
...
...
@@ -183,7 +183,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
AddressSpace
::
Vgpr
,
AddressSpace
::
Lds
,
InMemoryDataOperation
::
Set
>
(
{
0
,
0
,
b_block_data_on_global
,
0
},
{
0
,
0
,
0
,
0
});
MultiIndex
<
4
>
{
{
0
,
0
,
b_block_data_on_global
,
0
}
}
,
MultiIndex
<
4
>
{
{
0
,
0
,
0
,
0
}
}
);
// weight tensor
// global tensor in global memory, src of blockwise copy
...
...
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
View file @
f3099df8
...
...
@@ -10,20 +10,20 @@ struct DynamicPassThrough
using
LowerIndex
=
MultiIndex
<
1
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
const
i
ndex
_t
up_length_
;
const
UpperI
ndex
up_length
s
_
;
__host__
__device__
explicit
constexpr
DynamicPassThrough
(
const
index_t
&
low_length
)
:
up_length
_
{
low_length
}
:
up_length
s_
{
{
low_length
}
}
{
}
__host__
__device__
explicit
constexpr
DynamicPassThrough
()
:
up_length
_
{
0
}
{}
__host__
__device__
explicit
constexpr
DynamicPassThrough
()
:
up_length
s_
{
{
0
}
}
{}
__host__
__device__
constexpr
index_t
GetNumOfLowerDimension
()
{
return
1
;
}
__host__
__device__
static
constexpr
index_t
GetNumOfLowerDimension
()
{
return
1
;
}
__host__
__device__
constexpr
index_t
GetNumOfUpperDimension
()
{
return
1
;
}
__host__
__device__
static
constexpr
index_t
GetNumOfUpperDimension
()
{
return
1
;
}
__host__
__device__
constexpr
auto
GetUpperLengths
()
const
{
return
UpperIndex
{
up_length_
}
;
}
__host__
__device__
constexpr
const
auto
&
GetUpperLengths
()
const
{
return
up_length
s
_
;
}
template
<
typename
LowIdx
,
typename
UpIdx
>
__host__
__device__
static
void
CalculateLowerIndex
(
LowIdx
&
idx_low
,
const
UpIdx
&
idx_up
)
...
...
@@ -68,22 +68,22 @@ struct DynamicLeftPad
using
LowerIndex
=
MultiIndex
<
1
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
const
i
ndex
_t
up_length_
;
const
UpperI
ndex
up_length
s
_
;
const
index_t
left_pad_
;
__host__
__device__
explicit
constexpr
DynamicLeftPad
(
const
index_t
&
low_length
,
const
index_t
&
left_pad
)
:
up_length
_
{
low_length
+
left_pad
},
left_pad_
{
left_pad
}
:
up_length
s_
{
{
low_length
+
left_pad
}
}
,
left_pad_
{
left_pad
}
{
}
__host__
__device__
explicit
constexpr
DynamicLeftPad
()
:
up_length
_
{
0
},
left_pad_
{
0
}
{}
__host__
__device__
explicit
constexpr
DynamicLeftPad
()
:
up_length
s_
{
{
0
}
}
,
left_pad_
{
0
}
{}
__host__
__device__
static
constexpr
index_t
GetNumOfLowerDimension
()
{
return
1
;
}
__host__
__device__
static
constexpr
index_t
GetNumOfUpperDimension
()
{
return
1
;
}
__host__
__device__
constexpr
auto
GetUpperLengths
()
const
{
return
UpperIndex
{
up_length_
}
;
}
__host__
__device__
constexpr
const
auto
&
GetUpperLengths
()
const
{
return
up_length
s
_
;
}
template
<
typename
LowIdx
,
typename
UpIdx
>
__host__
__device__
constexpr
void
CalculateLowerIndex
(
LowIdx
&
idx_low
,
...
...
@@ -96,7 +96,8 @@ struct DynamicLeftPad
}
template
<
typename
LowIdxDiff
,
typename
UpIdxDiff
,
typename
LowIdx
,
typename
UpIdx
>
__host__
__device__
static
void
CalculateLowerIndexDiff
(
LowIdxDiff
&
idx_diff_low
,
__host__
__device__
static
constexpr
void
CalculateLowerIndexDiff
(
LowIdxDiff
&
idx_diff_low
,
const
UpIdxDiff
&
idx_diff_up
,
const
LowIdx
&
/* idx_low_old */
,
const
UpIdx
&
/* idx_up_old */
)
...
...
@@ -129,18 +130,18 @@ struct DynamicRightPad
using
LowerIndex
=
MultiIndex
<
1
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
const
i
ndex
_t
up_length_
;
const
UpperI
ndex
up_length
s
_
;
const
index_t
low_length_
;
const
index_t
right_pad_
;
__host__
__device__
explicit
constexpr
DynamicRightPad
(
const
index_t
&
low_length
,
const
index_t
&
right_pad
)
:
up_length
_
{
low_length
+
right_pad
},
low_length_
{
low_length
},
right_pad_
{
right_pad
}
:
up_length
s_
{
{
low_length
+
right_pad
}
}
,
low_length_
{
low_length
},
right_pad_
{
right_pad
}
{
}
__host__
__device__
explicit
constexpr
DynamicRightPad
()
:
up_length
_
{
0
},
low_length_
{
0
},
right_pad_
{
0
}
:
up_length
s_
{
{
0
}
}
,
low_length_
{
0
},
right_pad_
{
0
}
{
}
...
...
@@ -148,10 +149,11 @@ struct DynamicRightPad
__host__
__device__
static
constexpr
index_t
GetNumOfUpperDimension
()
{
return
1
;
}
__host__
__device__
constexpr
auto
GetUpperLengths
()
const
{
return
UpperIndex
{
up_length_
}
;
}
__host__
__device__
constexpr
const
auto
&
GetUpperLengths
()
const
{
return
up_length
s
_
;
}
template
<
typename
LowIdx
,
typename
UpIdx
>
__host__
__device__
static
void
CalculateLowerIndex
(
LowIdx
&
idx_low
,
const
UpIdx
&
idx_up
)
__host__
__device__
static
constexpr
void
CalculateLowerIndex
(
LowIdx
&
idx_low
,
const
UpIdx
&
idx_up
)
{
static_assert
(
LowIdx
::
Size
()
==
1
&&
UpIdx
::
Size
()
==
1
,
"wrong! inconsistent # of dimension"
);
...
...
@@ -160,7 +162,8 @@ struct DynamicRightPad
}
template
<
typename
LowIdxDiff
,
typename
UpIdxDiff
,
typename
LowIdx
,
typename
UpIdx
>
__host__
__device__
static
void
CalculateLowerIndexDiff
(
LowIdxDiff
&
idx_diff_low
,
__host__
__device__
static
constexpr
void
CalculateLowerIndexDiff
(
LowIdxDiff
&
idx_diff_low
,
const
UpIdxDiff
&
idx_diff_up
,
const
LowIdx
&
/* idx_low_old */
,
const
UpIdx
&
/* idx_up_old */
)
...
...
@@ -216,7 +219,7 @@ struct DynamicEmbed
__host__
__device__
static
constexpr
index_t
GetNumOfUpperDimension
()
{
return
NDimUp
;
}
__host__
__device__
constexpr
auto
GetUpperLengths
()
const
{
return
up_lengths_
;
}
__host__
__device__
constexpr
const
auto
&
GetUpperLengths
()
const
{
return
up_lengths_
;
}
template
<
typename
LowIdx
,
typename
UpIdx
>
__host__
__device__
constexpr
void
CalculateLowerIndex
(
LowIdx
&
idx_low
,
...
...
@@ -276,13 +279,13 @@ struct DynamicMerge
const
LowerIndex
low_lengths_
;
const
LowerIndex
low_lengths_scan_
;
const
i
ndex
_t
up_length_
;
const
UpperI
ndex
up_length
s
_
;
__host__
__device__
explicit
constexpr
DynamicMerge
(
const
LowerIndex
&
low_lengths
)
:
low_lengths_
{
low_lengths
},
low_lengths_scan_
{
reverse_exclusive_scan_on_array
(
low_lengths
,
math
::
multiplies
<
index_t
>
{},
index_t
{
1
})},
up_length
_
{
reduce_on_array
(
low_lengths
,
math
::
multiplies
<
index_t
>
(),
1
)}
up_length
s_
{
{
reduce_on_array
(
low_lengths
,
math
::
multiplies
<
index_t
>
(),
index_t
{
1
}
)}
}
{
static_assert
(
LowerIndex
::
Size
()
==
NDimLow
,
"wrong!"
);
}
...
...
@@ -290,7 +293,7 @@ struct DynamicMerge
__host__
__device__
explicit
constexpr
DynamicMerge
()
:
low_lengths_
{
make_zero_array
<
index_t
,
NDimLow
>
()},
low_lengths_scan_
{
make_zero_array
<
index_t
,
NDimLow
>
()},
up_length
_
{
0
}
up_length
s_
{
{
0
}
}
{
}
...
...
@@ -298,7 +301,7 @@ struct DynamicMerge
__host__
__device__
static
constexpr
index_t
GetNumOfUpperDimension
()
{
return
1
;
}
__host__
__device__
constexpr
auto
GetUpperLengths
()
const
{
return
UpperIndex
{
up_length_
}
;
}
__host__
__device__
constexpr
const
auto
&
GetUpperLengths
()
const
{
return
up_length
s
_
;
}
template
<
typename
LowIdx
,
typename
UpIdx
>
__host__
__device__
constexpr
void
CalculateLowerIndex
(
LowIdx
&
idx_low
,
...
...
@@ -444,7 +447,7 @@ struct DynamicUnMerge
__host__
__device__
static
constexpr
index_t
GetNumOfUpperDimension
()
{
return
NDimUp
;
}
__host__
__device__
constexpr
auto
GetUpperLengths
()
const
{
return
up_lengths_
;
}
__host__
__device__
constexpr
const
auto
&
GetUpperLengths
()
const
{
return
up_lengths_
;
}
template
<
typename
LowIdx
,
typename
UpIdx
>
__host__
__device__
constexpr
void
CalculateLowerIndex
(
LowIdx
&
idx_low
,
...
...
@@ -500,7 +503,7 @@ struct DynamicFreeze
__host__
__device__
static
constexpr
index_t
GetNumOfUpperDimension
()
{
return
0
;
}
__host__
__device__
constexpr
auto
GetUpperLengths
()
const
{
return
UpperIndex
{};
}
__host__
__device__
static
constexpr
auto
GetUpperLengths
()
{
return
UpperIndex
{};
}
template
<
typename
LowIdx
,
typename
UpIdx
>
__host__
__device__
constexpr
void
CalculateLowerIndex
(
LowIdx
&
idx_low
,
...
...
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
View file @
f3099df8
...
...
@@ -150,7 +150,7 @@ struct DynamicTensorDescriptor_v2
index_t
element_space_size
)
{
// zero initialization
HiddenIndex
hidden_lengths
{
0
};
HiddenIndex
hidden_lengths
{
{
0
}
};
// this is the orignal tensor element space size
hidden_lengths
(
0
)
=
element_space_size
;
...
...
@@ -319,7 +319,8 @@ struct lambda_get_up_dim_num
template
<
typename
I
>
__host__
__device__
constexpr
auto
operator
()(
I
)
const
{
return
Number
<
NewTransforms
{}.
At
(
I
{}).
GetNumOfUpperDimension
()
>
{};
using
Tran
=
remove_reference_t
<
decltype
(
NewTransforms
{}.
At
(
I
{}))
>
;
return
Number
<
Tran
::
GetNumOfUpperDimension
()
>
{};
}
};
...
...
@@ -488,7 +489,7 @@ __host__ __device__ void move_dynamic_tensor_coordinate_v2(const TensorDesc& ten
using
HiddenIndex
=
MultiIndex
<
ndim_hidden
>
;
// this is what needs to be calculated
auto
idx_diff_hidden
=
HiddenIndex
{
0
};
auto
idx_diff_hidden
=
HiddenIndex
{
{
0
}
};
// initialize visible index diff
// idx_diff_hidden_pick_visible contains reference to idx_diff_hidden
...
...
composable_kernel/include/tensor_description/multi_index_transform.hpp
View file @
f3099df8
...
...
@@ -364,7 +364,7 @@ struct UnMerge
__host__
__device__
static
constexpr
auto
CalculateLowerIndex
(
const
UpperIndex
&
idx_up
)
{
LowerIndex
idx_low
{
0
};
LowerIndex
idx_low
{
{
0
}
};
constexpr
auto
pseudo_up_strides
=
reverse_inclusive_scan_sequence
(
...
...
@@ -425,7 +425,7 @@ struct Embed
__host__
__device__
static
constexpr
auto
CalculateLowerIndex
(
const
UpperIndex
&
idx_up
)
{
LowerIndex
idx_low
=
{
Coefficients
{}[
nDimUp
]};
LowerIndex
idx_low
{
{
Coefficients
{}[
nDimUp
]}
}
;
for
(
index_t
i
=
0
;
i
<
nDimUp
;
++
i
)
{
...
...
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
View file @
f3099df8
...
...
@@ -196,7 +196,7 @@ struct ClusterDescriptor
__host__
__device__
static
constexpr
auto
CalculateClusterIndex
(
index_t
idx_1d
)
{
return
mDesc
.
CalculateLowerIndex
(
MultiIndex
<
1
>
{
idx_1d
});
return
mDesc
.
CalculateLowerIndex
(
MultiIndex
<
1
>
{
{
idx_1d
}
}
);
}
};
...
...
driver/CMakeLists.txt
View file @
f3099df8
...
...
@@ -17,6 +17,7 @@ install(TARGETS host LIBRARY DESTINATION lib)
if
(
DEVICE_BACKEND STREQUAL
"AMD"
)
set
(
CONV_SOURCE src/conv_driver.cpp
)
set
(
CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp
)
set
(
TRY_SOURCE src/try.cpp
)
elseif
(
DEVICE_BACKEND STREQUAL
"NVIDIA"
)
set
(
CONV_SOURCE src/conv_driver.cu
)
set
(
CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu
)
...
...
@@ -24,6 +25,8 @@ endif()
add_executable
(
conv_driver
${
CONV_SOURCE
}
)
add_executable
(
conv_bwd_data_driver
${
CONV_BWD_DATA_SOURCE
}
)
add_executable
(
try
${
TRY_SOURCE
}
)
target_link_libraries
(
conv_driver PRIVATE host
)
target_link_libraries
(
conv_bwd_data_driver PRIVATE host
)
target_link_libraries
(
try PRIVATE host
)
driver/include/device_dummy_dynamic_transform.hpp
View file @
f3099df8
...
...
@@ -12,7 +12,7 @@ template <class T,
class
ConvDilations
,
class
InLeftPads
,
class
InRightPads
>
void
device_dummy_dynamic_transform
(
InDesc
,
void
device_dummy_dynamic_transform
_1
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
...
...
@@ -52,11 +52,11 @@ void device_dummy_dynamic_transform(InDesc,
const
auto
in_gemmk_gemmn_global_desc
=
tensor_descs
.
At
(
Number
<
0
>
{});
auto
in_gemmk_gemmn_coord
=
make_dynamic_tensor_coordinate
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{
0
,
0
});
make_dynamic_tensor_coordinate
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{
{
0
,
0
}
}
);
for
(
index_t
iter
=
0
;
iter
<
10
;
++
iter
)
{
constexpr
auto
gemmk1_gemmn0
=
MultiIndex
<
2
>
{
1
,
0
};
constexpr
auto
gemmk1_gemmn0
=
MultiIndex
<
2
>
{
{
1
,
0
}
}
;
printf
(
"iter %d
\n
"
,
iter
);
...
...
@@ -147,7 +147,7 @@ template <class T,
class
ConvDilations
,
class
InLeftPads
,
class
InRightPads
>
void
device_dummy_dynamic_transform_
v
2
(
InDesc
,
void
device_dummy_dynamic_transform_2
(
InDesc
,
const
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
...
...
@@ -187,10 +187,10 @@ void device_dummy_dynamic_transform_v2(InDesc,
const
auto
in_gemmk_gemmn_global_desc
=
tensor_descs
.
At
(
Number
<
0
>
{});
auto
in_gemmk_gemmn_coord
=
make_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{
0
,
0
});
make_dynamic_tensor_coordinate_v2
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{
{
0
,
0
}
}
);
const
auto
in_gemmk_gemmn_coord_step
=
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{
0
,
1
});
const
auto
in_gemmk_gemmn_coord_step
=
make_dynamic_tensor_coordinate_step_v2
(
in_gemmk_gemmn_global_desc
,
MultiIndex
<
2
>
{
{
0
,
1
}
}
);
for
(
index_t
iter
=
0
;
iter
<
20
;
++
iter
)
{
...
...
driver/src/conv_driver.cpp
View file @
f3099df8
...
...
@@ -585,7 +585,7 @@ int main(int argc, char* argv[])
RightPads
{},
nrepeat
);
#elif 0
device_dummy_dynamic_transform
(
in_nchw_desc
,
device_dummy_dynamic_transform
_1
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
wei_kcyx
,
...
...
@@ -596,7 +596,7 @@ int main(int argc, char* argv[])
LeftPads
{},
RightPads
{},
nrepeat
);
#elif
0
#elif
1
device_dummy_dynamic_transform_2
(
in_nchw_desc
,
in_nchw
,
wei_kcyx_desc
,
...
...
driver/src/try.cpp
0 → 100644
View file @
f3099df8
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
int
main
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
;
auto
idx1
=
std
::
array
<
index_t
,
2
>
{{
1
,
0
}};
auto
idx2
=
Array
<
index_t
,
2
>
{{
1
,
0
}};
auto
idx3
=
MultiIndex
<
2
>
{{
1
,
0
}};
auto
idx0
=
MultiIndex
<
2
>
{{
1
,
0
}};
print_array
(
"idx2"
,
idx2
);
print_array
(
"idx3"
,
idx2
);
}
script/cmake-rocm3.
5
.sh
→
script/cmake-rocm3.
7
.sh
View file @
f3099df8
...
...
@@ -8,14 +8,17 @@ MY_PROJECT_INSTALL=../install.dir
cmake
\
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
CMAKE_BUILD_TYPE
=
Debug
\
-D
DEVICE_BACKEND
=
"AMD"
\
-D
CMAKE_CXX_FLAGS
=
"-O3 --amdgpu-target=gfx906 -mllvm
--amdgpu-enable-global-sgpr-addr -mllvm
--amdgpu-spill-vgpr-to-agpr=0"
\
-D
CMAKE_CXX_FLAGS
=
"-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0
-save-temps=
$CWD
"
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_PREFIX_PATH
=
"/opt/rocm"
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
${
MY_PROJECT_SOURCE
}
#-D CMAKE_CXX_FLAGS="-c -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-gline-tables-only -S -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment