gaoqiong / composable_kernel_ROCM · Commits

Commit 874a78f9
Authored Feb 09, 2024 by Jun Liu

    Merge branch 'amd-develop' into amd-master

Parents: 6368be50, 2fd6c6d4
Changes: 89 · Showing 9 changed files with 226 additions and 119 deletions (+226 −119)
Files in this page of the diff:

test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp        +2   -0
test/gemm_split_k/test_gemm_splitk_util.hpp                                +16  -3
test/grouped_gemm/test_grouped_gemm_util.hpp                               +16  -3
test/normalization_bwd_gamma_beta/CMakeLists.txt                           +13  -0
test/normalization_bwd_gamma_beta/test_groupnorm_bwd_gamma_beta_fp32.cpp   +51  -0
test/normalization_bwd_gamma_beta/test_layernorm2d_bwd_gamma_beta_fp32.cpp +48  -0
test/wrapper/test_copy.cpp                                                 +40  -39
test/wrapper/test_partition.cpp                                            +32  -59
test/wrapper/test_tensor.cpp                                               +8   -15
test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp

@@ -135,6 +135,8 @@ class TestConvTensorRearrangeInterface : public ::testing::Test
             return col2img.IsSupportedArgument(argument);
         }
+        throw std::runtime_error(
+            "Conv_tensor_rearrange: problem with tensor rearrange operator. ");
         return 1;
     }
 };
test/gemm_split_k/test_gemm_splitk_util.hpp

@@ -60,7 +60,9 @@ class TestGemmSplitK : public testing::Test
                  const int StrideA,
                  const int StrideB,
                  const int StrideC,
-                 int kbatch = 1)
+                 int kbatch   = 1,
+                 int n_warmup = 1,
+                 int n_iter   = 10)
     {
         bool pass = ck::profiler::profile_gemm_splitk_impl<ADataType,
                                                            BDataType,
@@ -68,8 +70,19 @@ class TestGemmSplitK : public testing::Test
                                                            CDataType,
                                                            ALayout,
                                                            BLayout,
-                                                           CLayout>(
-            verify_, init_method_, log_, bench_, M, N, K, StrideA, StrideB, StrideC, kbatch);
+                                                           CLayout>(verify_,
+                                                                    init_method_,
+                                                                    log_,
+                                                                    bench_,
+                                                                    M,
+                                                                    N,
+                                                                    K,
+                                                                    StrideA,
+                                                                    StrideB,
+                                                                    StrideC,
+                                                                    kbatch,
+                                                                    n_warmup,
+                                                                    n_iter);
         EXPECT_TRUE(pass);
     }
 };
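Note: the split-K helper now forwards n_warmup and n_iter to profile_gemm_splitk_impl, defaulting to 1 warmup launch and 10 timed iterations. As a rough sketch of why benchmarks separate the two (plain host C++, not CK's timing code; run_kernel is a hypothetical stand-in for the GEMM dispatch):

#include <chrono>
#include <cstdio>

// Hypothetical stand-in for a kernel dispatch; in the test above this role is
// played by the profiler implementation.
void run_kernel() { /* work */ }

// Warmup launches are excluded from timing so one-time costs (code object
// loading, cold caches) do not skew the per-iteration average.
double avg_ms(int n_warmup, int n_iter)
{
    for(int i = 0; i < n_warmup; ++i)
        run_kernel();
    const auto start = std::chrono::steady_clock::now();
    for(int i = 0; i < n_iter; ++i)
        run_kernel();
    const auto stop = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(stop - start).count() / n_iter;
}

int main() { std::printf("%.3f ms\n", avg_ms(1, 10)); }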
test/grouped_gemm/test_grouped_gemm_util.hpp

@@ -63,7 +63,9 @@ class TestGroupedGemm : public testing::TestWithParam<int>
                  const std::vector<int>& StrideAs,
                  const std::vector<int>& StrideBs,
                  const std::vector<int>& StrideCs,
-                 int kbatch = 1)
+                 int kbatch   = 1,
+                 int n_warmup = 1,
+                 int n_iter   = 10)
     {
         bool pass = ck::profiler::profile_grouped_gemm_impl<ADataType,
                                                             BDataType,
@@ -71,8 +73,19 @@ class TestGroupedGemm : public testing::TestWithParam<int>
                                                             float,
                                                             ALayout,
                                                             BLayout,
-                                                            ELayout>(
-            verify_, init_method_, log_, bench_, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch);
+                                                            ELayout>(verify_,
+                                                                     init_method_,
+                                                                     log_,
+                                                                     bench_,
+                                                                     Ms,
+                                                                     Ns,
+                                                                     Ks,
+                                                                     StrideAs,
+                                                                     StrideBs,
+                                                                     StrideCs,
+                                                                     kbatch,
+                                                                     n_warmup,
+                                                                     n_iter);
         EXPECT_TRUE(pass);
     }
 };
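The grouped-GEMM helper gets the same n_warmup/n_iter additions, and takes per-group size and stride vectors because each group is an independent GEMM with its own M, N, K. A naive row-major reference of what that computes, purely illustrative and not the CK implementation:

#include <vector>

// Group g computes an independent C[g] = A[g] * B[g] with its own M, N, K.
// All matrices are row-major here for simplicity.
void grouped_gemm_ref(const std::vector<std::vector<float>>& As,
                      const std::vector<std::vector<float>>& Bs,
                      std::vector<std::vector<float>>& Cs,
                      const std::vector<int>& Ms,
                      const std::vector<int>& Ns,
                      const std::vector<int>& Ks)
{
    for(std::size_t g = 0; g < Ms.size(); ++g)
        for(int m = 0; m < Ms[g]; ++m)
            for(int n = 0; n < Ns[g]; ++n)
            {
                float acc = 0.f;
                for(int k = 0; k < Ks[g]; ++k)
                    acc += As[g][m * Ks[g] + k] * Bs[g][k * Ns[g] + n];
                Cs[g][m * Ns[g] + n] = acc;
            }
}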
test/normalization_bwd_gamma_beta/CMakeLists.txt  (new file, mode 100644)

add_custom_target(test_normalization_bwd_gamma_beta)

add_gtest_executable(test_layernorm2d_bwd_gamma_beta_fp32 test_layernorm2d_bwd_gamma_beta_fp32.cpp)
if(result EQUAL 0)
    target_link_libraries(test_layernorm2d_bwd_gamma_beta_fp32 PRIVATE utility device_normalization_bwd_gamma_beta_instance)
    add_dependencies(test_normalization_bwd_gamma_beta test_layernorm2d_bwd_gamma_beta_fp32)
endif()

add_gtest_executable(test_groupnorm_bwd_gamma_beta_fp32 test_groupnorm_bwd_gamma_beta_fp32.cpp)
if(result EQUAL 0)
    target_link_libraries(test_groupnorm_bwd_gamma_beta_fp32 PRIVATE utility device_normalization_bwd_gamma_beta_instance)
    add_dependencies(test_normalization_bwd_gamma_beta test_groupnorm_bwd_gamma_beta_fp32)
endif()
test/normalization_bwd_gamma_beta/test_groupnorm_bwd_gamma_beta_fp32.cpp  (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include "gtest/gtest.h"
#include "profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp"

using F16 = ck::half_t;
using F32 = float;
using ck::index_t;

template <typename Tuple>
class TestgroupnormBwdGammaBeta : public ::testing::Test
{
    protected:
    using DYDataType         = std::tuple_element_t<0, Tuple>;
    using XDataType          = std::tuple_element_t<1, Tuple>;
    using MeanInvStdDataType = std::tuple_element_t<2, Tuple>;
    using ComputeDataType    = std::tuple_element_t<3, Tuple>;
    using DGammaDataType     = std::tuple_element_t<4, Tuple>;
    using DBetaDataType      = std::tuple_element_t<5, Tuple>;

    void Run()
    {
        // Bwd data: [N, H, W, G, C], reduce H, W, C
        std::vector<std::vector<ck::index_t>> lengths = {{1, 1, 1, 1, 1},
                                                         {1, 2, 3, 4, 5},
                                                         {256, 9, 9, 9, 9},
                                                         {1, 64, 64, 32, 10},
                                                         {1, 32, 32, 32, 20},
                                                         {1, 16, 16, 32, 40}};

        for(auto length : lengths)
        {
            bool success =
                ck::profiler::profile_groupnorm_bwd_gamma_beta_impl<DYDataType,
                                                                    XDataType,
                                                                    MeanInvStdDataType,
                                                                    ComputeDataType,
                                                                    DGammaDataType,
                                                                    DBetaDataType>(
                    true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
    }
};

using KernelTypes = ::testing::Types<
    // DYDataType XDataType, MeanInvStdDataType, ComputeDataType, DGammaDataType, DBetaDataType>
    std::tuple<F32, F32, F32, F32, F32, F32>>;

TYPED_TEST_SUITE(TestgroupnormBwdGammaBeta, KernelTypes);
TYPED_TEST(TestgroupnormBwdGammaBeta, Test_FP32) { this->Run(); }
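For reference, the quantities this test exercises: with per-(n, g) mean/inv_std statistics and per-(g, c) affine parameters, the usual definitions give dgamma as the reduction of dy * (x - mean) * inv_std over N, H, W and dbeta as the reduction of dy over the same dims. A naive host-side sketch under those standard definitions (illustrative names, not the profiler's verification code):

#include <cstddef>
#include <vector>

// Reference gamma/beta gradients for groupnorm over an NHWGC tensor,
// assuming standard definitions: mean/inv_std per (n, g), gamma/beta per (g, c).
void groupnorm_bwd_gamma_beta_ref(const std::vector<float>& dy,      // [N*H*W*G*C]
                                  const std::vector<float>& x,       // [N*H*W*G*C]
                                  const std::vector<float>& mean,    // [N*G]
                                  const std::vector<float>& inv_std, // [N*G]
                                  std::vector<float>& dgamma,        // [G*C]
                                  std::vector<float>& dbeta,         // [G*C]
                                  int N, int H, int W, int G, int C)
{
    for(int g = 0; g < G; ++g)
        for(int c = 0; c < C; ++c)
        {
            float dg = 0.f, db = 0.f;
            for(int n = 0; n < N; ++n)
                for(int h = 0; h < H; ++h)
                    for(int w = 0; w < W; ++w)
                    {
                        // Flatten (n, h, w, g, c) into the packed NHWGC offset.
                        const std::size_t i =
                            (((std::size_t(n) * H + h) * W + w) * G + g) * C + c;
                        dg += dy[i] * (x[i] - mean[n * G + g]) * inv_std[n * G + g];
                        db += dy[i];
                    }
            dgamma[g * C + c] = dg;
            dbeta[g * C + c]  = db;
        }
}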
test/normalization_bwd_gamma_beta/test_layernorm2d_bwd_gamma_beta_fp32.cpp  (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include "gtest/gtest.h"
#include "profiler/profile_layernorm_bwd_gamma_beta_impl.hpp"

using F16 = ck::half_t;
using F32 = float;
using ck::index_t;

template <typename Tuple>
class TestLayernorm2dBwdGammaBeta : public ::testing::Test
{
    protected:
    using DYDataType         = std::tuple_element_t<0, Tuple>;
    using XDataType          = std::tuple_element_t<1, Tuple>;
    using MeanInvStdDataType = std::tuple_element_t<2, Tuple>;
    using ComputeDataType    = std::tuple_element_t<3, Tuple>;
    using DGammaDataType     = std::tuple_element_t<4, Tuple>;
    using DBetaDataType      = std::tuple_element_t<5, Tuple>;

    void Run()
    {
        // Bwd data: [N, D], reduce D
        std::vector<std::vector<ck::index_t>> lengths = {
            {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};

        for(auto length : lengths)
        {
            bool success =
                ck::profiler::profile_layernorm_bwd_gamma_beta_impl<DYDataType,
                                                                    XDataType,
                                                                    MeanInvStdDataType,
                                                                    ComputeDataType,
                                                                    DGammaDataType,
                                                                    DBetaDataType,
                                                                    2>(
                    true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
    }
};

using KernelTypes = ::testing::Types<
    // DYDataType XDataType, MeanInvStdDataType, ComputeDataType, DGammaDataType, DBetaDataType>
    std::tuple<F32, F32, F32, F32, F32, F32>>;

TYPED_TEST_SUITE(TestLayernorm2dBwdGammaBeta, KernelTypes);
TYPED_TEST(TestLayernorm2dBwdGammaBeta, Test_FP32) { this->Run(); }
test/wrapper/test_copy.cpp

@@ -21,49 +21,59 @@ template <typename InputTensor,
           typename OutputTensor,
           typename BlockShape,
           typename ThreadLayoutShape,
-          typename LocalTileSteps,
-          typename LocalPartitionSteps>
+          bool UseOptimizedCopy>
 __global__ void TestCopyDevice(const InputTensor input_tensor,
                                OutputTensor output_tensor,
                                const BlockShape tile_shape,
-                               const ThreadLayoutShape thread_layout,
-                               const LocalTileSteps block_steps,
-                               const LocalPartitionSteps thread_steps)
+                               const ThreadLayoutShape thread_layout)
 {
     __shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)];
-    auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
+    const auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
         p_shared, ck::wrapper::make_layout(tile_shape));

-    const auto block_idxs = ck::make_tuple(ck::make_tuple(0, 0), blockIdx.x);
+    const auto block_idx = static_cast<ck::index_t>(blockIdx.x);
     // Get local tiles for global memory
     const auto input_local_tile =
-        ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs, block_steps);
+        ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idx);
     const auto output_local_tile =
-        ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs, block_steps);
+        ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idx);
     // Get partition per thread
     const auto input_local_partition =
-        ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x, thread_steps);
+        ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x);
     auto lds_local_partition =
-        ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x, thread_steps);
+        ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x);
     auto output_local_partition =
-        ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x, thread_steps);
+        ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x);
     // Allocate VGPR
-    constexpr ck::index_t scalar_per_vector = 1;
-    constexpr ck::index_t vgpr_size         = ck::wrapper::size(lds_local_partition);
-    auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
-                                                         vgpr_size,
-                                                         scalar_per_vector,
-                                                         ck::index_t>();
+    auto tensor_vgpr =
+        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
+            layout(lds_local_partition));
     // Perform copy
-    ck::wrapper::copy(input_local_partition, lds_local_partition);
-    ck::wrapper::copy(lds_local_partition, tensor_vgpr);
-    ck::wrapper::copy(tensor_vgpr, output_local_partition);
+    if constexpr(UseOptimizedCopy)
+    {
+        using DimAccessOrder                    = ck::Tuple<ck::Number<1>, ck::Number<0>>;
+        constexpr ck::index_t vector_dim        = 0;
+        constexpr ck::index_t scalar_per_vector = 2;
+        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(input_local_partition,
+                                                                         lds_local_partition);
+        // TODO: Enable optimized copy for static buffers
+        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(lds_local_partition,
+                                                                         tensor_vgpr);
+        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(tensor_vgpr,
+                                                                         output_local_partition);
+    }
+    else
+    {
+        ck::wrapper::copy(input_local_partition, lds_local_partition);
+        ck::wrapper::copy(lds_local_partition, tensor_vgpr);
+        ck::wrapper::copy(tensor_vgpr, output_local_partition);
+    }
 }

+template <bool UseOptimizedCopy>
 void PerformCopyGlobalToGlobalViaLDS()
 {
     const auto shape =
@@ -89,15 +99,8 @@ void PerformCopyGlobalToGlobalViaLDS()
     auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
         static_cast<ck::index_t*>(out_buf.GetDeviceBuffer()), layout);

-    const auto thread_layout =
-        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<32>{});
-    const auto tile_shape =
-        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<64>{});
-    const auto thread_steps =
-        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<2>{});
-    const auto block_steps =
-        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<64>{});
+    const auto thread_layout = ck::make_tuple(ck::Number<1>{}, ck::Number<32>{});
+    const auto tile_shape    = ck::make_tuple(ck::Number<4>{}, ck::Number<64>{});

     const ck::index_t grid_size = ck::math::integer_divide_ceil(
         ck::wrapper::size(input_tensor_global), ck::wrapper::size(tile_shape));
@@ -106,8 +109,7 @@ void PerformCopyGlobalToGlobalViaLDS()
                                          decltype(output_tensor_global),
                                          decltype(tile_shape),
                                          decltype(thread_layout),
-                                         decltype(block_steps),
-                                         decltype(thread_steps)>;
+                                         UseOptimizedCopy>;
     launch_and_time_kernel(StreamConfig{},
                            kernel,
                            dim3(grid_size),
@@ -116,9 +118,7 @@ void PerformCopyGlobalToGlobalViaLDS()
                            input_tensor_global,
                            output_tensor_global,
                            tile_shape,
-                           thread_layout,
-                           block_steps,
-                           thread_steps);
+                           thread_layout);
     // Verify results
     std::vector<ck::index_t> output_data(ck::wrapper::size(shape));
@@ -126,4 +126,5 @@ void PerformCopyGlobalToGlobalViaLDS()
     EXPECT_TRUE(ck::utils::check_err(output_data, input_data));
 }

-TEST(TestCopy, CopyGlobalToGlobalViaLDS) { PerformCopyGlobalToGlobalViaLDS(); }
+TEST(TestCopyGlobalToGlobalViaLDS, GenericCopy) { PerformCopyGlobalToGlobalViaLDS<false>(); }
+TEST(TestCopyGlobalToGlobalViaLDS, OptimizedCopy) { PerformCopyGlobalToGlobalViaLDS<true>(); }
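The kernel now selects the copy path at compile time with if constexpr, so the branch not taken is never instantiated and each test instantiates exactly one path. A minimal standalone illustration of that dispatch pattern (plain host C++, not CK code):

#include <cstdio>
#include <cstring>

// Compile-time dispatch: the untaken branch is discarded at instantiation,
// so each specialization compiles only one copy path.
template <bool UseOptimizedCopy>
void copy_n(const int* src, int* dst, int n)
{
    if constexpr(UseOptimizedCopy)
    {
        // "Optimized" path: bulk copy (stands in for the vectorized copy).
        std::memcpy(dst, src, n * sizeof(int));
    }
    else
    {
        // Generic path: element-wise copy.
        for(int i = 0; i < n; ++i)
            dst[i] = src[i];
    }
}

int main()
{
    int src[4] = {1, 2, 3, 4}, dst[4] = {};
    copy_n<false>(src, dst, 4); // generic
    copy_n<true>(src, dst, 4);  // optimized
    std::printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);
}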
test/wrapper/test_partition.cpp

@@ -29,42 +29,29 @@ TEST(TestPartition, LocalPartition)
     const auto tensor =
         ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);

-    const auto thread_steps =
-        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<1>{}), ck::Number<1>{});
-    const auto thread_layout =
-        ck::make_tuple(ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}), ck::Number<1>{});
-    for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++)
-    {
-        const auto raked_partition =
-            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id);
-        const auto expected_partition_size =
-            ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout);
-        EXPECT_EQ(ck::wrapper::size(raked_partition), expected_partition_size);
-        EXPECT_EQ(raked_partition(0), thread_id);
-    }
-
+    const auto thread_steps  = ck::make_tuple(ck::Number<8>{}, ck::Number<1>{});
+    const auto thread_layout = ck::make_tuple(ck::Number<8>{}, ck::Number<1>{});
     for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++)
     {
         const auto packed_partition =
-            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_steps);
+            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id);
         const auto expected_partition_size =
             ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout);
-        const auto expected_partition_first_val =
-            thread_id * ck::wrapper::size<0, 0>(thread_steps);
+        const auto expected_partition_first_val = thread_id * ck::wrapper::size<0>(thread_steps);
         const auto expected_partition_second_val = expected_partition_first_val + 1;
         EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size);
         EXPECT_EQ(packed_partition(0), expected_partition_first_val);
         EXPECT_EQ(packed_partition(1), expected_partition_second_val);
     }
 }

 TEST(TestPartition, LocalTile)
 {
-    const auto shape =
-        ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
-    const auto strides =
-        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
-    const auto layout = ck::wrapper::make_layout(shape, strides);
+    const auto shape   = ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}, ck::Number<4>{});
+    const auto strides = ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}, ck::Number<64>{});
+    const auto layout  = ck::wrapper::make_layout(shape, strides);
     std::vector<ck::index_t> data(ck::wrapper::size(layout));
     std::iota(data.begin(), data.end(), 0);
@@ -72,48 +59,34 @@ TEST(TestPartition, LocalTile)
     const auto tensor =
         ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);

-    const auto block_steps =
-        ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
-    const auto block_shape =
-        ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
-    const auto block_layout =
-        ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
-    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> block_idxs;
-    for(ck::index_t x = 0; x < ck::wrapper::size<0, 0>(block_layout); x++)
-    {
-        for(ck::index_t y = 0; y < ck::wrapper::size<0, 1>(block_layout); y++)
-        {
-            for(ck::index_t z = 0; z < ck::wrapper::size<1>(block_layout); z++)
-            {
-                block_idxs.emplace_back(ck::make_tuple(x, y), z);
-            }
-        }
-    }
-    for(const auto& block_idx : block_idxs)
-    {
-        const auto raked_tile = ck::wrapper::make_local_tile(tensor, block_shape, block_idx);
-        const auto expected_tile_size = ck::wrapper::size(block_shape);
-        EXPECT_EQ(ck::wrapper::size(raked_tile), expected_tile_size);
-        EXPECT_EQ(raked_tile(0), layout(block_idx));
-    }
-
+    const auto block_shape = ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}, ck::Number<2>{});
+    const auto num_blocks =
+        ck::make_tuple(ck::wrapper::size<0>(shape) / ck::wrapper::size<0>(block_shape),
+                       ck::wrapper::size<1>(shape) / ck::wrapper::size<1>(block_shape),
+                       ck::wrapper::size<2>(shape) / ck::wrapper::size<2>(block_shape));
+    std::vector<ck::index_t> block_idxs(ck::wrapper::size(num_blocks));
+    std::iota(block_idxs.begin(), block_idxs.end(), 0);
-    for(const auto& block_idx : block_idxs)
+    for(auto block_idx : block_idxs)
     {
         const auto packed_tile =
-            ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_steps);
+            ck::wrapper::make_local_tile(tensor, block_shape, block_idx);
         const auto expected_tile_size = ck::wrapper::size(block_shape);
-        const auto expected_tile_first_val =
-            ck::wrapper::size<0, 0>(block_idx) * ck::wrapper::size<0, 0>(block_shape) *
-                ck::wrapper::size<0, 0>(strides) +
-            ck::wrapper::size<0, 1>(block_idx) * ck::wrapper::size<0, 1>(block_shape) *
-                ck::wrapper::size<0, 1>(strides) +
-            ck::wrapper::size<1>(block_idx) * ck::wrapper::size<1>(block_shape) *
-                ck::wrapper::size<1>(strides);
+        auto expected_tile_first_val = (block_idx % ck::wrapper::size<2>(num_blocks)) *
+                                       ck::wrapper::size<2>(block_shape) *
+                                       ck::wrapper::size<2>(strides);
+        block_idx /= ck::wrapper::size<2>(num_blocks);
+        expected_tile_first_val += (block_idx % ck::wrapper::size<1>(num_blocks)) *
+                                   ck::wrapper::size<1>(block_shape) *
+                                   ck::wrapper::size<1>(strides);
+        block_idx /= ck::wrapper::size<1>(num_blocks);
+        expected_tile_first_val += (block_idx % ck::wrapper::size<0>(num_blocks)) *
+                                   ck::wrapper::size<0>(block_shape) *
+                                   ck::wrapper::size<0>(strides);
         const auto expected_tile_second_val = expected_tile_first_val + 1;
         EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size);
         EXPECT_EQ(packed_tile(0), expected_tile_first_val);
         EXPECT_EQ(packed_tile(1), expected_tile_second_val);
     }
 }
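The rewritten test derives each tile's expected first value by decomposing the linear block index into per-dimension block coordinates (last dimension varying fastest) and mapping them through block shape and strides. The same mixed-radix decomposition in isolation, with illustrative names:

#include <array>
#include <cstdio>

// Decompose a linear block index into 3D block coordinates (last dimension
// fastest), then map to a flat element offset via block sizes and strides.
int tile_first_offset(int block_idx,
                      const std::array<int, 3>& num_blocks,
                      const std::array<int, 3>& block_shape,
                      const std::array<int, 3>& strides)
{
    int offset = 0;
    for(int dim = 2; dim >= 0; --dim)
    {
        const int coord = block_idx % num_blocks[dim]; // coordinate in this dim
        block_idx /= num_blocks[dim];                  // peel off this radix
        offset += coord * block_shape[dim] * strides[dim];
    }
    return offset;
}

int main()
{
    // Shape (16, 4, 4), strides (1, 16, 64), block shape (2, 4, 2)
    // => num_blocks (8, 1, 2), mirroring the test above.
    std::printf("%d\n", tile_first_offset(3, {8, 1, 2}, {2, 4, 2}, {1, 16, 64}));
}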
test/wrapper/test_tensor.cpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <cstdlib>
 #include <iostream>
@@ -100,31 +100,26 @@ TEST(TestTensor, ReadWriteHostMemory)
 __global__ void TestTensorReadWriteDevice(void* data, void* success)
 {
-    constexpr ck::index_t nelems            = 8;
-    constexpr ck::index_t scalar_per_vector = 1;
+    constexpr ck::index_t nelems = 8;
     __shared__ ck::index_t p_shared[nelems];

     ck::index_t* casted_data_ptr = static_cast<ck::index_t*>(data);
     bool* casted_success_ptr     = static_cast<bool*>(success);

     const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
+    constexpr auto vgpr_layout =
+        ck::wrapper::make_layout(make_tuple(ck::Number<nelems>{}), make_tuple(ck::Number<1>{}));
     auto tensor_global =
         ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
     auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
-    auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
-                                                         nelems,
-                                                         scalar_per_vector,
-                                                         ck::index_t>();
-    auto tensor_sgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Sgpr,
-                                                         nelems,
-                                                         scalar_per_vector,
-                                                         ck::index_t>();
+    auto tensor_vgpr =
+        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
+            vgpr_layout);

     InitTensor(tensor_global);
     InitTensor(tensor_lds);
     StaticInitTensor<nelems>(tensor_vgpr);
-    StaticInitTensor<nelems>(tensor_sgpr);

     *casted_success_ptr = TestTensorCheck1d(tensor_global);
     *casted_success_ptr &= TestTensorCheck3d(tensor_global);
@@ -133,8 +128,6 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success)
     *casted_success_ptr &= TestTensorCheck3d(tensor_lds);
     *casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_vgpr);
-    *casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_sgpr);
 }

 TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)