Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
574fd35a
Commit
574fd35a
authored
Oct 09, 2023
by
Astha Rai
Browse files
adding client example
parent
f157518d
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
430 additions
and
0 deletions
+430
-0
client_example/23_elementwise_transpose/CMakeLists.txt
client_example/23_elementwise_transpose/CMakeLists.txt
+2
-0
client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp
...ple/23_elementwise_transpose/elementwise_transpose_3d.cpp
+156
-0
library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose.hpp
...sor_operation_instance/gpu/transpose/device_transpose.hpp
+207
-0
library/include/ck/library/tensor_operation_instance/gpu/transpose_3d.hpp
...ck/library/tensor_operation_instance/gpu/transpose_3d.hpp
+65
-0
No files found.
client_example/23_elementwise_transpose/CMakeLists.txt
0 → 100644
View file @
574fd35a
add_executable
(
client_elementwise_transpose3d elementwise_transpose_3d.cpp
)
target_link_libraries
(
client_elementwise_transpose3d PRIVATE composable_kernel::device_operations
)
client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp
0 → 100644
View file @
574fd35a
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
struct
SimpleDeviceMem
{
SimpleDeviceMem
()
=
delete
;
SimpleDeviceMem
(
std
::
size_t
mem_size
)
:
p_mem_
{}
{
(
void
)
hipMalloc
(
static_cast
<
void
**>
(
&
p_mem_
),
mem_size
);
}
void
*
GetDeviceBuffer
()
{
return
p_mem_
;
}
~
SimpleDeviceMem
()
{
(
void
)
hipFree
(
p_mem_
);
}
void
*
p_mem_
;
};
int
main
()
{
bool
time_kernel
=
true
;
const
int
N
=
16
;
const
int
C
=
8
;
const
int
D
=
8
;
const
int
H
=
8
;
const
int
W
=
8
;
std
::
vector
<
std
::
size_t
>
ncdhw
=
{
N
,
C
,
D
,
H
,
W
};
std
::
vector
<
std
::
size_t
>
nchwd
=
{
N
,
C
,
H
,
W
,
D
};
Tensor
<
ADataType
>
a
(
ncdhw
);
Tensor
<
BDataType
>
b
(
nchwd
);
auto
size
=
N
*
C
*
D
*
H
*
W
;
a
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
std
::
array
<
ck
::
index_t
,
5
>
ab_lengths
{
N
,
C
,
H
,
W
,
D
};
std
::
array
<
ck
::
index_t
,
5
>
a_strides
=
{
C
*
D
*
H
*
W
,
D
*
H
*
W
,
1
,
D
*
H
,
D
};
std
::
array
<
ck
::
index_t
,
5
>
b_strides
=
{
C
*
H
*
W
*
D
,
H
*
W
*
D
,
W
*
D
,
D
,
1
};
SimpleDeviceMem
a_dev_buf
(
sizeof
(
ADataType
)
*
size
);
SimpleDeviceMem
b_dev_buf
(
sizeof
(
BDataType
)
*
size
);
std
::
array
<
const
void
*
,
1
>
input
=
{
a_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
void
*
,
1
>
output
=
{
b_device_buf
.
GetDeviceBuffer
()};
using
DeviceElementwisePermuteInstance
=
ck
::
tensor_operation
::
device
::
DeviceElementwise3dImpl
<
ck
::
Tuple
<
ADataType
>
,
ck
::
Tuple
<
BDataType
>
,
PassThrough
,
2
,
2
,
1
,
8
,
8
,
8
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
1
>>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_op_name
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
float
best_ave_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
ab_lengths
,
{
a_strides
},
{
b_strides
},
input
,
output
,
PassThrough
{});
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
(
ncdhw
[
0
]
*
ncdhw
[
1
]
*
ncdhw
[
2
]
*
ncdhw
[
3
]
*
ncdhw
[
4
])
+
sizeof
(
BDataType
)
*
(
ncdhw
[
0
]
*
ncdhw
[
1
]
*
ncdhw
[
2
]
*
ncdhw
[
3
]
*
ncdhw
[
4
]);
float
gb_per_sec
=
num_byte
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
ave_time
<
best_ave_time
)
{
found
=
true
;
best_op_id
=
i
;
best_op_name
=
op_name
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
// run the best intance
{
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
ab_lengths
,
{
a_strides
},
{
b_strides
},
input
,
output
,
PassThrough
{});
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
}
return
0
;
}
\ No newline at end of file
library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose.hpp
0 → 100644
View file @
574fd35a
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
namespace
ck
::
tensor_layout
::
convolution
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
device_transpose_f16_instances
=
std
::
tuple
<
// clang-format off FOR 16, 32, 16, 32, 16
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
2
,
2
,
1
,
8
,
8
,
8
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
2
,
2
,
1
,
8
,
8
,
8
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
2
,
2
,
1
,
8
,
8
,
8
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
2
,
2
,
1
,
8
,
8
,
8
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
2
,
2
,
1
,
8
,
1
,
1
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
2
,
2
,
1
,
8
,
1
,
1
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
2
,
2
,
1
,
8
,
4
,
4
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
2
,
2
,
1
,
8
,
4
,
4
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
// clang-format on
>
;
using
device_transpose_f32_instances
=
std
::
tuple
<
// clang-format off // for 16, 8, 16, 32, 8 -> test with instances for fp16
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
2
,
2
,
1
,
4
,
4
,
4
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
2
,
2
,
1
,
4
,
4
,
4
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
2
,
2
,
1
,
4
,
4
,
4
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
2
,
2
,
1
,
4
,
8
,
4
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
2
,
2
,
1
,
4
,
8
,
8
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
2
,
2
,
1
,
4
,
8
,
8
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
2
,
2
,
1
,
4
,
8
,
8
,
ck
::
Sequence
<
4
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
2
,
2
,
1
,
4
,
8
,
8
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
4
>>
,
DeviceElementwise3dImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
2
,
2
,
1
,
4
,
4
,
8
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
8
>>
,
// clang-format on
>
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
\ No newline at end of file
library/include/ck/library/tensor_operation_instance/gpu/transpose_3d.hpp
0 → 100644
View file @
574fd35a
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise_3d_impl.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
namespace
ck
::
transpose_op
;
void
add_device_transpose_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise3dImpl
<
F16
,
F16
,
NCDHW
,
3
>>>&
instances
);
void
add_device_transpose_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise3dImpl
<
F32
,
F32
,
NCDHW
,
3
>>>&
instances
);
template
<
typename
InDataTypeTuple
,
typename
OutDataTypeTuple
,
typename
ElementwiseOperation
,
index_t
NumDim
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceElementwise3dImpl
<
InDataTypeTuple
,
OutDataTypeTuple
,
ElementwiseOperation
,
NumDim
>>
{
using
DeviceOp
=
DeviceElementwise3dImpl
<
InDataTypeTuple
,
OutDataTypeTuple
,
ElementwiseOperation
,
NumDim_m
,
// choose how to set dims
NumDim_n
,
NumDim_k
,
MPerThread
,
NPerThread
,
KPerThread
,
InScalarPerVectorSeq
,
OutScalarPerVectorSeq
>
;
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_transpose_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_transpose_f16_instances
(
op_ptrs
);
}
}
return
op_ptrs
;
}
};
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment