gaoqiong / composable_kernel · Commit 5736b460

Commit 5736b460, authored Feb 08, 2023 by fsx950223
Merge branch 'my-attn-bwd2' into my-attn-bwd3
Parents: aace9ec6, f9bb62d5

Changes: 28 · Showing 8 changed files with 337 additions and 0 deletions (+337, -0)
Changed files:
  include/ck/utility/thread_group.hpp                                                                         +33  -0
  library/include/ck/library/reference_tensor_operation/cpu/reference_dropout.hpp                             +102 -0
  library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp  +19  -0
  test/CMakeLists.txt                                                                                         +1   -0
  test/host_tensor/CMakeLists.txt                                                                             +2   -0
  test/host_tensor/test_host_tensor.cpp                                                                       +106 -0
  test/softmax/CMakeLists.txt                                                                                 +3   -0
  test/softmax/test_softmax_host_ref.cpp                                                                      +71  -0
include/ck/utility/thread_group.hpp (view file @ 5736b460)

...
@@ -19,4 +19,37 @@ struct ThisThreadBlock

    __device__ static index_t GetThreadId() { return get_thread_local_1d_id(); }
};

template <index_t ThreadPerBlock>
struct SubThreadBlock
{
    static constexpr index_t kNumThread_ = ThreadPerBlock;

    __device__ SubThreadBlock(int mwave, int nwave) : mwave_(mwave), nwave_(nwave) {}

    __device__ static constexpr index_t GetNumOfThread() { return kNumThread_; }

    template <typename TupleArg1, typename TupleArg2>
    __device__ constexpr bool IsBelong(const TupleArg1& mwave_range, const TupleArg2& nwave_range)
    {
        // wave_range[I0] inclusive, wave_range[I1] exclusive
        if(mwave_ < mwave_range[I0])
            return false;
        else if(mwave_ >= mwave_range[I1])
            return false;
        else if(nwave_ < nwave_range[I0])
            return false;
        else if(nwave_ >= nwave_range[I1])
            return false;
        else
            return true;
    }

    __device__ static index_t GetThreadId() { return get_thread_local_1d_id(); }

    private:
    index_t mwave_, nwave_;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
};

} // namespace ck
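In the new SubThreadBlock, IsBelong treats each wave range as half-open: wave_range[I0] is inclusive and wave_range[I1] is exclusive. A minimal host-side sketch of the same predicate, using plain std::pair ranges instead of ck tuples (the helper names and example values are illustrative assumptions, not code from this commit):

#include <utility>

// Illustrative host-side analogue of SubThreadBlock::IsBelong; the real class
// indexes ck tuples with Number<0>/Number<1> and runs on the device.
struct WaveCoord
{
    int mwave_;
    int nwave_;

    // True iff (mwave_, nwave_) lies in [m.first, m.second) x [n.first, n.second).
    bool IsBelong(std::pair<int, int> m, std::pair<int, int> n) const
    {
        return mwave_ >= m.first && mwave_ < m.second && nwave_ >= n.first && nwave_ < n.second;
    }
};

// Example: the wave at (1, 2) belongs to the tile covering wave rows [0, 2) and columns [2, 4):
//   WaveCoord{1, 2}.IsBelong({0, 2}, {2, 4}) == true
//   WaveCoord{2, 2}.IsBelong({0, 2}, {2, 4}) == false   // row bound is exclusive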
library/include/ck/library/reference_tensor_operation/cpu/reference_dropout.hpp (new file, mode 100644, view file @ 5736b460)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>
#include <vector>
#include <algorithm>

#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

namespace ck {
namespace tensor_operation {
namespace host {

template <typename RefDataType, typename InDataType, typename OutDataType>
struct ReferenceDropout : public device::BaseOperator
{
    // Argument
    struct Argument : public device::BaseArgument
    {
        Argument(const Tensor<RefDataType>& ref,
                 const Tensor<InDataType>& in,
                 Tensor<OutDataType>& out,
                 RefDataType p_dropout_in_16bits,
                 float rp_dropout)
            : ref_(ref),
              in_(in),
              out_(out),
              p_dropout_in_16bits_(p_dropout_in_16bits),
              rp_dropout_(ck::type_convert<OutDataType>(rp_dropout))
        {
        }

        const Tensor<RefDataType>& ref_;
        const Tensor<InDataType>& in_;
        Tensor<OutDataType>& out_;
        RefDataType p_dropout_in_16bits_;
        OutDataType rp_dropout_;
    };

    // Invoker
    struct Invoker : public device::BaseInvoker
    {
        float Run(const Argument& arg)
        {
            arg.out_.ForEach([&](auto& self, auto idx) {
                self(idx) =
                    arg.ref_(idx) < arg.p_dropout_in_16bits_ ? arg.in_(idx) * arg.rp_dropout_ : 0;
            });
            return 0;
        }

        float Run(const device::BaseArgument* p_arg,
                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg));
        }
    };

    static constexpr bool IsValidCompilationParameter()
    {
        // TODO: properly implement this check
        return true;
    }

    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }

    static auto MakeArgument(const Tensor<RefDataType>& ref,
                             const Tensor<InDataType>& in,
                             Tensor<OutDataType>& out,
                             RefDataType p_dropout_in_16bits,
                             float rp_dropout)
    {
        return Argument{ref, in, out, p_dropout_in_16bits, rp_dropout};
    }

    static auto MakeInvoker() { return Invoker{}; }

    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
    {
        return std::make_unique<Invoker>(Invoker{});
    }

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();
        // clang-format off
        str << "ReferenceDropout" << std::endl;
        // clang-format on
        return str.str();
    }
};

} // namespace host
} // namespace tensor_operation
} // namespace ck
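ReferenceDropout keeps an element when its ref value is below the 16-bit threshold, scales it by rp_dropout, and writes zero otherwise. A host-side usage sketch follows; the 4x8 shape, the uint16_t ref type, and the 0.8 keep ratio are illustrative assumptions rather than values taken from this commit:

#include <cstdint>

#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_dropout.hpp"

void run_reference_dropout_example()
{
    // Hypothetical 4x8 problem; in real use, ref would hold per-element random 16-bit values
    // and in would hold the activations to be dropped out.
    Tensor<uint16_t> ref({4, 8});
    Tensor<float> in({4, 8});
    Tensor<float> out({4, 8});

    // Elements whose ref value is below the threshold are kept and rescaled; the rest become 0.
    const uint16_t p_dropout_in_16bits = static_cast<uint16_t>(0.8 * 65535.0); // assumed keep threshold
    const float rp_dropout             = 1.f / 0.8f;                           // assumed rescale factor

    using RefDropout = ck::tensor_operation::host::ReferenceDropout<uint16_t, float, float>;

    auto argument = RefDropout::MakeArgument(ref, in, out, p_dropout_in_16bits, rp_dropout);
    auto invoker  = RefDropout::MakeInvoker();
    invoker.Run(argument);
}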
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp (view file @ 5736b460)

...
@@ -11,6 +11,25 @@

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

/*
For an fp16 M-contiguous matrix of size M_K, each thread reads a 4x2 tile (2 * 64 bits) from global
memory, transposes the 4x2 tile inside registers, and writes it into LDS in K0_M_K1 layout. This
allows us to use the 128-bit LDS write instruction. It also avoids write bank conflicts, because two
vertically connected 4x2 tiles form a contiguous chunk of memory when modeled as a K0_M_K1 layout
with K1=2.

         <- K1 ->             <- K1 ->            <- K1 ->
         _________            _________           _________
  |      | 0 | 4 |  transpose | 0 - 1 |   to LDS  | 0 - 1 |
  |      | 1 | 5 |    --->    | 2 - 3 |   ---->   | 2 - 3 |
  |      | 2 | 6 |            | 4 - 5 |           | 4 - 5 |
  M      | 3 | 7 |            | 6 - 7 |           | 6 - 7 |
  |      ---------            ---------           ---------
  |      |  ...  |            |  ...  |           |  ...  |
  v      ---------            ---------           ---------
           VMEM                 VGPR                 LDS
*/

namespace ck {
namespace tensor_operation {
namespace device {

...
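As a worked illustration of the K0_M_K1 layout described in the comment above, the sketch below (a hypothetical helper, not code from this commit) linearizes a [K0][M][K1] buffer row-major and checks that, with K1 = 2, a 4(M) x 2(K) tile occupies 8 consecutive fp16 slots, i.e. exactly one 128-bit LDS write:

#include <cassert>
#include <cstddef>

// Illustrative only: row-major linearization of a [K0][M][K1] buffer, the K0_M_K1 layout
// referred to above, with K1 fixed at 2 as in the comment.
constexpr std::size_t K1 = 2;

constexpr std::size_t k0_m_k1_offset(std::size_t k, std::size_t m, std::size_t M)
{
    const std::size_t k0 = k / K1; // coarse K index
    const std::size_t k1 = k % K1; // position within a K1 pair
    return k0 * M * K1 + m * K1 + k1;
}

int main()
{
    constexpr std::size_t M = 128; // assumed M extent, for illustration

    // The 4(M) x 2(K) tile starting at (m = 0, k = 0) maps to 8 consecutive offsets 0..7,
    // so one thread can store its transposed tile with a single 128-bit LDS write.
    const std::size_t base = k0_m_k1_offset(0, 0, M);
    for(std::size_t m = 0; m < 4; ++m)
        for(std::size_t k = 0; k < 2; ++k)
            assert(k0_m_k1_offset(k, m, M) == base + m * K1 + k);

    return 0;
}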
test/CMakeLists.txt (view file @ 5736b460)

...
@@ -58,3 +58,4 @@ add_subdirectory(batchnorm)

if(GPU_TARGETS MATCHES "gfx1100")
    add_subdirectory(wmma_op)
endif()
add_subdirectory(host_tensor)
test/host_tensor/CMakeLists.txt (new file, mode 100644, view file @ 5736b460)

add_gtest_executable(test_host_tensor test_host_tensor.cpp)
target_link_libraries(test_host_tensor PRIVATE utility)
\ No newline at end of file
test/host_tensor/test_host_tensor.cpp (new file, mode 100644, view file @ 5736b460)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <vector>

#include <gtest/gtest.h>

#include "ck/ck.hpp"
#include "ck/library/utility/host_tensor.hpp"

using namespace ck;

TEST(HostTensorTranspose, TestBadArugment)
{
    Tensor<float> tensor({13, 7});
    EXPECT_THROW(tensor.Transpose({0}), std::runtime_error);
    EXPECT_THROW(tensor.Transpose({0, 1, 2}), std::runtime_error);
}

TEST(HostTensorTranspose, Test2D)
{
    std::vector<size_t> lengths  = {13, 7};
    std::vector<size_t> tlengths = {7, 13};
    Tensor<float> tensor(lengths);
    tensor(0, 0) = 0.f;
    tensor(3, 4) = 34.f;
    EXPECT_EQ(tensor.GetLengths(), lengths);
    EXPECT_EQ(tensor(0, 0), 0.f);
    EXPECT_EQ(tensor(3, 4), 34.f);
    EXPECT_EQ(tensor(4, 3), 0.f);
    EXPECT_EQ(tensor.Transpose().GetLengths(), tlengths);
    EXPECT_EQ(tensor.Transpose()(0, 0), 0.f);
    EXPECT_EQ(tensor.Transpose()(4, 3), 34.f);
    EXPECT_EQ(tensor.Transpose()(3, 4), 0.f);
}

TEST(HostTensorTranspose, Test3D)
{
    std::vector<size_t> lengths  = {13, 7, 5};
    std::vector<size_t> tlengths = {5, 7, 13};
    Tensor<float> tensor(lengths);
    tensor(0, 0, 0) = 0.f;
    tensor(3, 4, 2) = 342.f;
    EXPECT_EQ(tensor.GetLengths(), lengths);
    EXPECT_EQ(tensor(0, 0, 0), 0.f);
    EXPECT_EQ(tensor(3, 4, 2), 342.f);
    EXPECT_EQ(tensor(4, 3, 2), 0.f);
    EXPECT_EQ(tensor.Transpose().GetLengths(), tlengths);
    EXPECT_EQ(tensor.Transpose()(0, 0, 0), 0.f);
    EXPECT_EQ(tensor.Transpose()(2, 4, 3), 342.f);
    EXPECT_EQ(tensor.Transpose()(2, 3, 4), 0.f);
}

TEST(HostTensorTranspose, Test3D_021)
{
    std::vector<size_t> lengths  = {13, 7, 5};
    std::vector<size_t> tlengths = {13, 5, 7};
    Tensor<float> tensor(lengths);
    tensor(0, 0, 0) = 0.f;
    tensor(3, 4, 2) = 342.f;
    EXPECT_EQ(tensor.GetLengths(), lengths);
    EXPECT_EQ(tensor(0, 0, 0), 0.f);
    EXPECT_EQ(tensor(3, 4, 2), 342.f);
    EXPECT_EQ(tensor(4, 3, 2), 0.f);
    // transpose last two dimensions
    EXPECT_EQ(tensor.Transpose({0, 2, 1}).GetLengths(), tlengths);
    EXPECT_EQ(tensor.Transpose({0, 2, 1})(0, 0, 0), 0.f);
    EXPECT_EQ(tensor.Transpose({0, 2, 1})(2, 4, 3), 0.f);
    EXPECT_EQ(tensor.Transpose({0, 2, 1})(3, 2, 4), 342.f);
    EXPECT_EQ(tensor.Transpose({0, 2, 1})(2, 3, 4), 0.f);
    // transpose last two dimensions back again
    EXPECT_EQ(tensor.Transpose({0, 2, 1}).Transpose({0, 2, 1}).GetLengths(), lengths);
    EXPECT_EQ(tensor.Transpose({0, 2, 1}).Transpose({0, 2, 1})(3, 4, 2), 342.f);
}

TEST(HostTensorTranspose, TestNonpacked2D)
{
    std::vector<size_t> lengths  = {13, 7};
    std::vector<size_t> strides  = {100, 1};
    std::vector<size_t> tlengths = {7, 13};
    Tensor<float> tensor(lengths, strides);
    tensor(0, 0) = 0.f;
    tensor(3, 4) = 34.f;
    EXPECT_EQ(tensor.GetLengths(), lengths);
    EXPECT_EQ(tensor(0, 0), 0.f);
    EXPECT_EQ(tensor(3, 4), 34.f);
    EXPECT_EQ(tensor(4, 3), 0.f);
    EXPECT_EQ(tensor.Transpose().GetLengths(), tlengths);
    EXPECT_EQ(tensor.Transpose()(0, 0), 0.f);
    EXPECT_EQ(tensor.Transpose()(4, 3), 34.f);
    EXPECT_EQ(tensor.Transpose()(3, 4), 0.f);
}
test/softmax/CMakeLists.txt (view file @ 5736b460)

...
@@ -3,9 +3,12 @@ add_custom_target(test_softmax)

add_gtest_executable(test_softmax_rank3 test_softmax_rank3.cpp)
add_gtest_executable(test_softmax_rank4 test_softmax_rank4.cpp)
add_gtest_executable(test_softmax_interface test_softmax_interface.cpp)
add_gtest_executable(test_softmax_host_ref test_softmax_host_ref.cpp)
target_link_libraries(test_softmax_rank3 PRIVATE utility device_softmax_instance)
target_link_libraries(test_softmax_rank4 PRIVATE utility device_softmax_instance)
target_link_libraries(test_softmax_interface PRIVATE utility device_softmax_instance)
target_link_libraries(test_softmax_host_ref PRIVATE utility)
add_dependencies(test_softmax test_softmax_rank3)
add_dependencies(test_softmax test_softmax_rank4)
add_dependencies(test_softmax test_softmax_interface)
add_dependencies(test_softmax test_softmax_host_ref)
test/softmax/test_softmax_host_ref.cpp (new file, mode 100644, view file @ 5736b460)

[diff collapsed, not expanded on this page]