Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
4131b712
Commit
4131b712
authored
Sep 27, 2023
by
Umang Yadav
Browse files
additional changes to make it work
parent
213196c0
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
32 additions
and
41 deletions
+32
-41
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+13
-11
include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
...or_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
+0
-2
include/ck/utility/amd_wave_read_first_lane.hpp
include/ck/utility/amd_wave_read_first_lane.hpp
+7
-10
include/ck/utility/data_type.hpp
include/ck/utility/data_type.hpp
+0
-10
include/ck/utility/f8_utils.hpp
include/ck/utility/f8_utils.hpp
+2
-2
include/ck/utility/math.hpp
include/ck/utility/math.hpp
+2
-0
include/ck/utility/math_v2.hpp
include/ck/utility/math_v2.hpp
+2
-1
include/ck/utility/random_gen.hpp
include/ck/utility/random_gen.hpp
+4
-3
include/ck/utility/type_convert.hpp
include/ck/utility/type_convert.hpp
+2
-2
No files found.
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
View file @
4131b712
...
...
@@ -8,8 +8,10 @@
#include "ck/utility/tuple.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#ifndef __HIPCC_RTC__
#include <limits>
#include <stdlib.h>
#endif
namespace
ck
{
...
...
@@ -777,7 +779,7 @@ struct BlockToCTileMap_GemmStreamK
uint32_t
dp_for_sk_iters
=
k_iters_per_tile
.
get
();
uint32_t
best_sk_score
=
std
::
n
umeric
_l
imits
<
int
>::
m
ax
();
// we need to find the smallest sk iters
ck
::
N
umeric
L
imits
<
int
32_t
>::
M
ax
();
// we need to find the smallest sk iters
for
(
uint32_t
tentative_sk_blocks
=
min_sk_tiles
;
tentative_sk_blocks
<
max_sk_tiles
;
tentative_sk_blocks
++
)
{
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
View file @
4131b712
...
...
@@ -3,8 +3,6 @@
#pragma once
#include <iostream>
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
#ifndef __HIPCC_RTC__
...
...
include/ck/utility/amd_wave_read_first_lane.hpp
View file @
4131b712
...
...
@@ -39,7 +39,7 @@ struct get_carrier<3>
{
using
value_type
=
uint32_t
;
std
::
array
<
std
::
byte
,
3
>
bytes
;
std
::
byte
bytes
[
3
]
;
static_assert
(
sizeof
(
bytes
)
<=
sizeof
(
value_type
));
// replacement of host std::copy_n()
...
...
@@ -61,15 +61,12 @@ struct get_carrier<3>
}
// method to trigger template substitution failure
__device__
carrier
(
const
carrier
&
other
)
noexcept
{
copy_n
(
other
.
bytes
.
begin
(),
bytes
.
size
(),
bytes
.
begin
());
}
__device__
carrier
(
const
carrier
&
other
)
noexcept
{
copy_n
(
&
other
.
bytes
[
0
],
3
,
&
bytes
[
0
]);
}
public:
__device__
carrier
&
operator
=
(
value_type
value
)
noexcept
{
copy_n
(
reinterpret_cast
<
const
std
::
byte
*>
(
&
value
),
bytes
.
size
()
,
bytes
.
begin
()
);
copy_n
(
reinterpret_cast
<
const
std
::
byte
*>
(
&
value
),
3
,
&
bytes
[
0
]
);
return
*
this
;
}
...
...
@@ -78,7 +75,7 @@ struct get_carrier<3>
{
std
::
byte
result
[
sizeof
(
value_type
)];
copy_n
(
bytes
.
begin
(),
bytes
.
size
()
,
result
);
copy_n
(
&
bytes
[
0
],
3
,
result
);
return
*
reinterpret_cast
<
const
value_type
*>
(
result
);
}
...
...
@@ -102,9 +99,9 @@ __device__ inline int32_t amd_wave_read_first_lane(int32_t value)
return
__builtin_amdgcn_readfirstlane
(
value
);
}
template
<
typename
Object
,
typename
=
std
::
enable_if_t
<
std
::
is_class_v
<
Object
>
&&
std
::
is_trivially_copyable
_v
<
Object
>>>
template
<
typename
Object
,
typename
=
std
::
enable_if_t
<
std
::
is_class
<
Object
>
::
value
&&
std
::
is_trivially_copyable
<
Object
>
::
value
>>
__device__
auto
amd_wave_read_first_lane
(
const
Object
&
obj
)
{
using
Size
=
unsigned
;
...
...
include/ck/utility/data_type.hpp
View file @
4131b712
...
...
@@ -1027,16 +1027,6 @@ struct NumericLimits<uint16_t>
// Quiet-NaN query for the uint16_t specialization; it always yields 0.
__host__ __device__ static constexpr uint16_t QuietNaN()
{
    return static_cast<uint16_t>(0);
}
};
// NumericLimits specialization for uint8_t.
// Lowest() and Min() are 0 and Max() is 255. Infinity() and QuietNaN() also
// return 0 (presumably as placeholders, since the type has no such encodings).
template <>
struct NumericLimits<uint8_t>
{
    // Lowest representable value.
    __host__ __device__ static constexpr uint8_t Lowest() noexcept
    {
        return static_cast<uint8_t>(0);
    }

    // Minimum value; identical to Lowest() for this type.
    __host__ __device__ static constexpr uint8_t Min() noexcept
    {
        return static_cast<uint8_t>(0);
    }

    // Maximum value: all eight bits set (255).
    __host__ __device__ static constexpr uint8_t Max() noexcept
    {
        return static_cast<uint8_t>(0xFFU);
    }

    // Value reported for "infinity"; this specialization returns 0.
    __host__ __device__ static constexpr uint8_t Infinity() noexcept
    {
        return static_cast<uint8_t>(0);
    }

    // Value reported for "quiet NaN"; this specialization returns 0.
    __host__ __device__ static constexpr uint8_t QuietNaN()
    {
        return static_cast<uint8_t>(0);
    }
};
template
<
>
struct
NumericLimits
<
float
>
{
...
...
include/ck/utility/f8_utils.hpp
View file @
4131b712
...
...
@@ -44,7 +44,7 @@ __host__ __device__ f8_t run_cast_to_f8(T x, uint32_t rng)
constexpr
uint32_t
nan_mask
=
is_half
?
0x7C00
:
0x7F800000
;
// convert to bitwise
typedef
typename
std
::
conditional
<
std
::
is_same
<
T
,
half_t
>::
value
,
uint16_t
,
uint32_t
>::
type
typedef
typename
ck
::
conditional
<
std
::
is_same
<
T
,
half_t
>::
value
,
uint16_t
,
uint32_t
>::
type
T_bitwise
;
T_bitwise
x_bitwise
=
*
(
reinterpret_cast
<
T_bitwise
*>
(
&
x
));
...
...
@@ -180,7 +180,7 @@ __host__ __device__ T run_cast_from_f8(f8_t x)
constexpr
int
exp_low_cutoff
=
(
1
<<
(
type_exp
-
1
))
-
(
1
<<
(
f8_exp
-
1
))
+
1
-
(
negative_zero_nan
?
1
:
0
);
typename
std
::
conditional
<
std
::
is_same
<
T
,
half_t
>::
value
,
uint16_t
,
uint32_t
>::
type
retval
;
typename
ck
::
conditional
<
std
::
is_same
<
T
,
half_t
>::
value
,
uint16_t
,
uint32_t
>::
type
retval
;
if
constexpr
(
negative_zero_nan
)
{
...
...
include/ck/utility/math.hpp
View file @
4131b712
...
...
@@ -168,9 +168,11 @@ __device__ double exp<double>(double x)
return
exp
(
x
);
}
#ifndef __HIPCC_RTC__
// Host-only single-precision exponential: forwards x to the global ::expf.
static inline __host__ float exp(float x)
{
    const float result = ::expf(x);
    return result;
}
// Host-only double-precision exponential: forwards x to std::exp.
static inline __host__ double exp(double x)
{
    const double result = std::exp(x);
    return result;
}
#endif
// greatest common divisor, aka highest common factor
__host__
__device__
constexpr
index_t
gcd
(
index_t
x
,
index_t
y
)
...
...
include/ck/utility/math_v2.hpp
View file @
4131b712
...
...
@@ -13,6 +13,7 @@
namespace
ck
{
namespace
math
{
#ifndef __HIPCC_RTC__
// math functions for the host, some are implemented by calling C++ std functions
// Host-only absolute value for float; forwards x to std::abs.
// The stray ';' that followed the function body (an empty declaration that
// warns under -Wextra-semi) has been removed.
static inline __host__ float abs(float x)
{
    return std::abs(x);
}
...
...
@@ -100,7 +101,7 @@ static inline __host__ half_t tanh(half_t x)
// Host-only hyperbolic tangent for float; forwards x to std::tanh.
// The stray ';' that followed the function body (an empty declaration that
// warns under -Wextra-semi) has been removed.
static inline __host__ float tanh(float x)
{
    return std::tanh(x);
}
// Host-only hyperbolic tangent for double; forwards x to std::tanh.
// The stray ';' that followed the function body (an empty declaration that
// warns under -Wextra-semi) has been removed.
static inline __host__ double tanh(double x)
{
    return std::tanh(x);
}
#endif
// math functions for the HIP kernel, some are implemented by calling hip builtin functions
// Device-only absolute value for float; forwards x to the global ::abs.
// NOTE(review): this relies on a float overload of ::abs being visible in
// device code. If only the int overload were found, x would be truncated.
// Confirm against the HIP math headers.
// The stray ';' that followed the function body (an empty declaration that
// warns under -Wextra-semi) has been removed.
static inline __device__ float abs(float x)
{
    return ::abs(x);
}
...
...
include/ck/utility/random_gen.hpp
View file @
4131b712
...
...
@@ -2,6 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <ck/utility/ignore.hpp>
namespace
ck
{
...
...
@@ -43,9 +44,9 @@ template <typename T,
std
::
enable_if_t
<!
(
std
::
is_same
<
float
,
T
>{}
||
std
::
is_same
<
half_t
,
T
>
{}),
bool
>
=
false
>
__host__
__device__
uint32_t
prand_generator
(
int
id
,
T
val
,
uint32_t
seed
=
seed_t
)
{
std
::
ignore
=
id
;
std
::
ignore
=
val
;
std
::
ignore
=
seed
;
ck
::
ignore
=
id
;
ck
::
ignore
=
val
;
ck
::
ignore
=
seed
;
return
0
;
}
...
...
include/ck/utility/type_convert.hpp
View file @
4131b712
...
...
@@ -190,7 +190,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
constexpr
f8_rounding_mode
rm
=
f8_rounding_mode
::
stochastic
;
constexpr
int
seed
=
42
;
// thread id is not available on host, so the address of x is passed as the id for prn generation
uint32_t
rng
=
prand_generator
<
float
,
seed
>
(
reinterpret_cast
<
uintptr
_t
>
(
&
x
),
x
);
uint32_t
rng
=
prand_generator
<
float
,
seed
>
(
reinterpret_cast
<
size
_t
>
(
&
x
),
x
);
return
utils
::
cast_to_f8
<
float
,
negative_zero_nan
,
clip
,
(
rm
==
f8_rounding_mode
::
stochastic
)
>
(
x
,
rng
);
}
...
...
@@ -204,7 +204,7 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
constexpr
f8_rounding_mode
rm
=
f8_rounding_mode
::
stochastic
;
constexpr
int
seed
=
42
;
// thread id is not available on host, so the address of x is passed as the id for prn generation
uint32_t
rng
=
prand_generator
<
half_t
,
seed
>
(
reinterpret_cast
<
uintptr
_t
>
(
&
x
),
x
);
uint32_t
rng
=
prand_generator
<
half_t
,
seed
>
(
reinterpret_cast
<
size
_t
>
(
&
x
),
x
);
return
utils
::
cast_to_f8
<
half_t
,
negative_zero_nan
,
clip
,
(
rm
==
f8_rounding_mode
::
stochastic
)
>
(
x
,
rng
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment