gaoqiong / composable_kernel_ROCM / Commits / 4396a224

Commit 4396a224 (unverified)
Authored by Harisankar Sadasivan on Apr 16, 2024; committed by GitHub on Apr 16, 2024.

    Merge branch 'develop' into mi300_time_measurement

Parents: 0a27f07e, 501a6b68
Changes: 187. Showing 20 changed files with 2239 additions and 0 deletions (+2239 / -0):

    include/ck_tile/core/utility/unary_element_function.hpp            +67
    include/ck_tile/host.hpp                                            +22
    include/ck_tile/host/arg_parser.hpp                                +184
    include/ck_tile/host/check_err.hpp                                 +394
    include/ck_tile/host/device_memory.hpp                             +112
    include/ck_tile/host/fill.hpp                                      +232
    include/ck_tile/host/hip_check_error.hpp                            +36
    include/ck_tile/host/host_tensor.hpp                               +523
    include/ck_tile/host/kernel_launch.hpp                             +166
    include/ck_tile/host/ranges.hpp                                     +69
    include/ck_tile/host/reference/reference_batched_elementwise.hpp    +64
    include/ck_tile/host/reference/reference_batched_gemm.hpp           +50
    include/ck_tile/host/reference/reference_batched_masking.hpp        +32
    include/ck_tile/host/reference/reference_batched_softmax.hpp        +71
    include/ck_tile/host/reference/reference_gemm.hpp                   +50
    include/ck_tile/host/reference/reference_im2col.hpp                 +61
    include/ck_tile/host/reference/reference_reduce.hpp                 +32
    include/ck_tile/host/reference/reference_softmax.hpp                +51
    include/ck_tile/host/stream_config.hpp                              +17
    include/ck_tile/ops/common.hpp                                       +6
include/ck_tile/core/utility/unary_element_function.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"

namespace ck_tile {

template <typename F, typename... Fs>
struct composes : private composes<F>
{
    template <typename FirstArg, typename... RestArgs>
    CK_TILE_HOST_DEVICE constexpr explicit composes(FirstArg&& firstArg, RestArgs&&... restArgs)
        : composes<F>(std::forward<FirstArg>(firstArg)),
          inner_(std::forward<RestArgs>(restArgs)...)
    {
    }

    template <typename Arg>
    CK_TILE_HOST_DEVICE constexpr auto operator()(Arg&& arg) const
    {
        return static_cast<const composes<F>&>(*this)(inner_(std::forward<Arg>(arg)));
    }

    private:
    composes<Fs...> inner_;
};

template <typename F>
struct composes<F>
{
    static_assert(!std::is_reference_v<F>);

    template <typename Arg, typename = std::enable_if_t<std::is_constructible_v<F, Arg>>>
    CK_TILE_HOST_DEVICE constexpr explicit composes(Arg&& arg) : f_(std::forward<Arg>(arg))
    {
    }

    template <typename Arg,
              typename = std::enable_if_t<std::is_invocable_v<std::add_const_t<F>&, Arg>>>
    CK_TILE_HOST_DEVICE constexpr auto operator()(Arg&& arg) const
    {
        return f_(std::forward<Arg>(arg));
    }

    private:
    F f_;
};

/// FIXME: create macro to replace '__host__ __device__' and nothing more
template <typename... Ts>
__host__ __device__ composes(Ts&&...) -> composes<remove_cvref_t<Ts>...>;

template <typename To>
struct saturates
{
    template <typename From>
    CK_TILE_HOST_DEVICE constexpr auto operator()(const From& from) const
        -> std::enable_if_t<std::is_arithmetic_v<From>, From>
    {
        return clamp(from,
                     type_convert<From>(numeric<To>::lowest()),
                     type_convert<From>(numeric<To>::max()));
    }
};

} // namespace ck_tile
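
Usage sketch (editor's illustration, not part of the commit): composes stores its functors and applies them right-to-left, so composes(f, g)(x) evaluates f(g(x)); saturates<To> clamps a value to the numeric range of To while keeping the source type. The scaling lambda below is hypothetical.

// hypothetical example: scale, then clamp into fp8's representable range
#include "ck_tile/core/utility/unary_element_function.hpp"

float scale_and_saturate(float x)
{
    auto scale = [](float v) { return v * 0.5f; }; // made-up element op
    // composes(f, g)(x) == f(g(x)): scale runs first, saturates<...> last
    ck_tile::composes op(ck_tile::saturates<ck_tile::fp8_t>{}, scale);
    return op(x);
}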
include/ck_tile/host.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/host/arg_parser.hpp"
#include "ck_tile/host/check_err.hpp"
#include "ck_tile/host/device_memory.hpp"
#include "ck_tile/host/fill.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/host/ranges.hpp"
#include "ck_tile/host/reference/reference_batched_elementwise.hpp"
#include "ck_tile/host/reference/reference_batched_gemm.hpp"
#include "ck_tile/host/reference/reference_batched_masking.hpp"
#include "ck_tile/host/reference/reference_batched_softmax.hpp"
#include "ck_tile/host/reference/reference_gemm.hpp"
#include "ck_tile/host/reference/reference_im2col.hpp"
#include "ck_tile/host/reference/reference_reduce.hpp"
#include "ck_tile/host/reference/reference_softmax.hpp"
#include "ck_tile/host/stream_config.hpp"
include/ck_tile/host/arg_parser.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <string>
#include <iomanip>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <unordered_map>
#include <vector>

namespace ck_tile {

/*
 * a host side utility, arg parser for
 * -[key0]=[value0] -[key1]=[value1] ...
 */
class ArgParser
{
    public:
    class Arg
    {
        public:
        std::string name;
        std::string value;
        std::string help_text;
    };

    ArgParser() {}

    ArgParser& insert(const std::string& _name,
                      const std::string& _default_value,
                      const std::string& _help_text)
    {
        Arg in;
        in.name      = _name;
        in.value     = _default_value;
        in.help_text = _help_text;

        if(input_map.count(_name) != 0)
        {
            printf("arg:%s already exist\n", _name.c_str());
        }
        else
        {
            input_map[_name] = in;
            keys.push_back(_name);
        }
        return *this;
    }

    void print()
    {
        printf("args:\n");
        for(auto& key : keys)
        {
            auto value = input_map[key];

            std::vector<std::string> help_text_lines;
            size_t pos = 0;
            for(size_t next_pos = value.help_text.find('\n', pos); next_pos != std::string::npos;)
            {
                help_text_lines.push_back(std::string(value.help_text.begin() + pos,
                                                      value.help_text.begin() + next_pos++));
                pos      = next_pos;
                next_pos = value.help_text.find('\n', pos);
            }
            help_text_lines.push_back(
                std::string(value.help_text.begin() + pos, value.help_text.end()));

            std::string default_value = std::string("(default:") + value.value + std::string(")");

            std::cout << std::setw(2) << std::setw(12 - value.name.length()) << "-" << key
                      << std::setw(4) << " " << help_text_lines[0] << " " << default_value
                      << std::endl;

            for(auto help_next_line = std::next(help_text_lines.begin());
                help_next_line != help_text_lines.end();
                ++help_next_line)
            {
                std::cout << std::setw(17) << " " << *help_next_line << std::endl;
            }
        }
    }

    bool parse(int argc, char* argv[], int start_index = 1)
    {
        if(argc < start_index)
        {
            printf("not enough args\n");
            return false;
        }
        for(int i = start_index; i < argc; i++)
        {
            char* cur_arg = argv[i];
            if(cur_arg[0] != '-')
            {
                printf("illegal input\n");
                print();
                return false;
            }
            else
            {
                std::string text(cur_arg + 1);
                if(text == "?")
                {
                    print();
                    return false;
                }
                auto pos = text.find('=');
                if(pos == std::string::npos)
                {
                    printf("arg should be [key]=[value] pair, here:%s\n", text.c_str());
                    return false;
                }
                if(pos >= (text.size() - 1))
                {
                    printf("cant find value after \"=\", here:%s\n", text.c_str());
                    return false;
                }
                auto key   = text.substr(0, pos);
                auto value = text.substr(pos + 1);

                if(input_map.count(key) == 0)
                {
                    printf("no such arg:%s\n", key.c_str());
                    return false;
                }
                input_map[key].value = value;
            }
        }
        return true;
    }

    std::string get_str(const std::string& name) const
    {
        std::string value = input_map.at(name).value;
        return value;
    }

    int get_int(const std::string& name) const
    {
        int value = atoi(input_map.at(name).value.c_str());
        return value;
    }

    uint32_t get_uint32(const std::string& name) const
    {
        uint32_t value = strtoul(input_map.at(name).value.c_str(), nullptr, 10);
        return value;
    }

    uint64_t get_uint64(const std::string& name) const
    {
        uint64_t value = strtoull(input_map.at(name).value.c_str(), nullptr, 10);
        return value;
    }

    bool get_bool(const std::string& name) const
    {
        auto v = input_map.at(name).value;
        if(v.compare("t") == 0 || v.compare("true") == 0)
            return true;
        if(v.compare("f") == 0 || v.compare("false") == 0)
            return false;
        int value = atoi(v.c_str());
        return value == 0 ? false : true;
    }

    float get_float(const std::string& name) const
    {
        double value = atof(input_map.at(name).value.c_str());
        return static_cast<float>(value);
    }

    double get_double(const std::string& name) const
    {
        double value = atof(input_map.at(name).value.c_str());
        return value;
    }

    private:
    std::unordered_map<std::string, Arg> input_map;
    std::vector<std::string> keys;
};

} // namespace ck_tile
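
Usage sketch (not part of the commit; the option names are made up): options are registered with insert(), parsed from -key=value arguments, and read back through the typed getters. parse() prints the registered options and returns false when it sees -?.

#include "ck_tile/host/arg_parser.hpp"

int main(int argc, char* argv[])
{
    ck_tile::ArgParser parser;
    parser.insert("m", "1024", "number of rows") // insert() returns *this,
        .insert("v", "1", "validate the result"); // so registrations chain

    if(!parser.parse(argc, argv)) // e.g. ./a.out -m=2048 -v=0
        return -1;

    const int  m        = parser.get_int("m");
    const bool validate = parser.get_bool("v");
    printf("m=%d validate=%d\n", m, validate);
    return 0;
}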
include/ck_tile/host/check_err.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <iomanip>
#include <iterator>
#include <limits>
#include <type_traits>
#include <vector>

#include "ck_tile/core.hpp"
#include "ck_tile/host/ranges.hpp"

namespace ck_tile {

template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
    using size_type = typename std::vector<T>::size_type;

    os << "[";
    for(size_type idx = 0; idx < v.size(); ++idx)
    {
        if(0 < idx)
        {
            os << ", ";
        }
        os << v[idx];
    }
    return os << "]";
}

template <typename Range, typename RefRange>
typename std::enable_if<
    std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
        std::is_floating_point_v<ranges::range_value_t<Range>> &&
        !std::is_same_v<ranges::range_value_t<Range>, half_t>,
    bool>::type
CK_TILE_HOST check_err(const Range& out,
                       const RefRange& ref,
                       const std::string& msg  = "Error: Incorrect results!",
                       double rtol             = 1e-5,
                       double atol             = 3e-6,
                       bool allow_infinity_ref = false)
{
    if(out.size() != ref.size())
    {
        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl;
        return false;
    }

    const auto is_infinity_error = [=](auto o, auto r) {
        const bool either_not_finite      = !std::isfinite(o) || !std::isfinite(r);
        const bool both_infinite_and_same = std::isinf(o) && std::isinf(r) && (o == r);

        return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
    };

    bool res{true};
    int err_count  = 0;
    double err     = 0;
    double max_err = std::numeric_limits<double>::min();

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        const double o = *std::next(std::begin(out), i);
        const double r = *std::next(std::begin(ref), i);
        err            = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            if(err_count < 5)
            {
                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                          << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        const float error_percent =
            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
        std::cerr << "max err: " << max_err;
        std::cerr << ", number of errors: " << err_count;
        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
    }
    return res;
}

template <typename Range, typename RefRange>
typename std::enable_if<
    std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
        std::is_same_v<ranges::range_value_t<Range>, bf16_t>,
    bool>::type
CK_TILE_HOST check_err(const Range& out,
                       const RefRange& ref,
                       const std::string& msg  = "Error: Incorrect results!",
                       double rtol             = 1e-3,
                       double atol             = 1e-3,
                       bool allow_infinity_ref = false)
{
    if(out.size() != ref.size())
    {
        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl;
        return false;
    }

    const auto is_infinity_error = [=](auto o, auto r) {
        const bool either_not_finite      = !std::isfinite(o) || !std::isfinite(r);
        const bool both_infinite_and_same = std::isinf(o) && std::isinf(r) && (o == r);

        return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
    };

    bool res{true};
    int err_count = 0;
    double err    = 0;
    // TODO: This is a hack. We should have proper specialization for bf16_t data type.
    double max_err = std::numeric_limits<float>::min();

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        const double o = type_convert<float>(*std::next(std::begin(out), i));
        const double r = type_convert<float>(*std::next(std::begin(ref), i));
        err            = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            if(err_count < 5)
            {
                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                          << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        const float error_percent =
            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
        std::cerr << "max err: " << max_err;
        std::cerr << ", number of errors: " << err_count;
        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
    }
    return res;
}

template <typename Range, typename RefRange>
typename std::enable_if<
    std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
        std::is_same_v<ranges::range_value_t<Range>, half_t>,
    bool>::type
CK_TILE_HOST check_err(const Range& out,
                       const RefRange& ref,
                       const std::string& msg  = "Error: Incorrect results!",
                       double rtol             = 1e-3,
                       double atol             = 1e-3,
                       bool allow_infinity_ref = false)
{
    if(out.size() != ref.size())
    {
        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl;
        return false;
    }

    const auto is_infinity_error = [=](auto o, auto r) {
        const bool either_not_finite      = !std::isfinite(o) || !std::isfinite(r);
        const bool both_infinite_and_same = std::isinf(o) && std::isinf(r) && (o == r);

        return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
    };

    bool res{true};
    int err_count = 0;
    double err    = 0;
    double max_err =
        static_cast<double>(std::numeric_limits<ranges::range_value_t<Range>>::min());

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        const double o = type_convert<float>(*std::next(std::begin(out), i));
        const double r = type_convert<float>(*std::next(std::begin(ref), i));
        err            = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            if(err_count < 5)
            {
                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                          << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        const float error_percent =
            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
        std::cerr << "max err: " << max_err;
        std::cerr << ", number of errors: " << err_count;
        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
    }
    return res;
}

template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_integral_v<ranges::range_value_t<Range>> &&
                  !std::is_same_v<ranges::range_value_t<Range>, bf16_t>)
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
                     || std::is_same_v<ranges::range_value_t<Range>, int4_t>
#endif
                 ,
                 bool>
CK_TILE_HOST check_err(const Range& out,
                       const RefRange& ref,
                       const std::string& msg = "Error: Incorrect results!",
                       double                 = 0,
                       double atol            = 0)
{
    if(out.size() != ref.size())
    {
        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl;
        return false;
    }

    bool res{true};
    int err_count   = 0;
    int64_t err     = 0;
    int64_t max_err = std::numeric_limits<int64_t>::min();

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        const int64_t o = *std::next(std::begin(out), i);
        const int64_t r = *std::next(std::begin(ref), i);
        err             = std::abs(o - r);

        if(err > atol)
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            if(err_count < 5)
            {
                std::cerr << msg << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r
                          << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        const float error_percent =
            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
        std::cerr << "max err: " << max_err;
        std::cerr << ", number of errors: " << err_count;
        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
    }
    return res;
}

template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_same_v<ranges::range_value_t<Range>, fp8_t>),
                 bool>
CK_TILE_HOST check_err(const Range& out,
                       const RefRange& ref,
                       const std::string& msg                 = "Error: Incorrect results!",
                       unsigned max_rounding_point_distance   = 1,
                       double atol                            = 1e-1,
                       bool allow_infinity_ref                = false)
{
    if(out.size() != ref.size())
    {
        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl;
        return false;
    }

    const auto is_infinity_error = [=](auto o, auto r) {
        const bool either_not_finite      = !std::isfinite(o) || !std::isfinite(r);
        const bool both_infinite_and_same = std::isinf(o) && std::isinf(r) && (o == r);

        return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
    };

    static const auto get_rounding_point_distance = [](fp8_t o, fp8_t r) -> unsigned {
        static const auto get_sign_bit = [](fp8_t v) -> bool { return 0x80 & bit_cast<uint8_t>(v); };

        if(get_sign_bit(o) ^ get_sign_bit(r))
        {
            return std::numeric_limits<unsigned>::max();
        }
        else
        {
            return std::abs(bit_cast<int8_t>(o) - bit_cast<int8_t>(r));
        }
    };

    bool res{true};
    int err_count  = 0;
    double err     = 0;
    double max_err = std::numeric_limits<float>::min();
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        const fp8_t o_fp8 = *std::next(std::begin(out), i);
        const fp8_t r_fp8 = *std::next(std::begin(ref), i);

        const double o_fp64 = type_convert<float>(o_fp8);
        const double r_fp64 = type_convert<float>(r_fp8);

        err = std::abs(o_fp64 - r_fp64);
        if(!(less_equal<double>{}(err, atol) ||
             get_rounding_point_distance(o_fp8, r_fp8) <= max_rounding_point_distance) ||
           is_infinity_error(o_fp64, r_fp64))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            if(err_count < 5)
            {
                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                          << "] != ref[" << i << "]: " << o_fp64 << " != " << r_fp64 << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
    }
    return res;
}

template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_same_v<ranges::range_value_t<Range>, bf8_t>),
                 bool>
CK_TILE_HOST check_err(const Range& out,
                       const RefRange& ref,
                       const std::string& msg  = "Error: Incorrect results!",
                       double rtol             = 1e-3,
                       double atol             = 1e-3,
                       bool allow_infinity_ref = false)
{
    if(out.size() != ref.size())
    {
        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                  << std::endl;
        return false;
    }

    const auto is_infinity_error = [=](auto o, auto r) {
        const bool either_not_finite      = !std::isfinite(o) || !std::isfinite(r);
        const bool both_infinite_and_same = std::isinf(o) && std::isinf(r) && (o == r);

        return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
    };

    bool res{true};
    int err_count  = 0;
    double err     = 0;
    double max_err = std::numeric_limits<float>::min();
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        const double o = type_convert<float>(*std::next(std::begin(out), i));
        const double r = type_convert<float>(*std::next(std::begin(ref), i));
        err            = std::abs(o - r);
        if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
        {
            max_err = err > max_err ? err : max_err;
            err_count++;
            if(err_count < 5)
            {
                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                          << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
            }
            res = false;
        }
    }
    if(!res)
    {
        std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
    }
    return res;
}

} // namespace ck_tile
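
Usage sketch (editor's illustration): with float data, overload resolution lands on the floating-point version above, so the tolerances default to rtol = 1e-5 and atol = 3e-6. Any container works as long as ranges::range_value_t can see its element type.

#include "ck_tile/host/check_err.hpp"
#include <vector>

bool validate(const std::vector<float>& out, const std::vector<float>& ref)
{
    // prints up to 4 mismatches plus a summary, and returns false on failure
    return ck_tile::check_err(out, ref, "Error: Incorrect results!");
}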
include/ck_tile/host/device_memory.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <hip/hip_runtime.h>
#include <stdint.h>
#include <stdexcept>

#include "ck_tile/host/hip_check_error.hpp"

namespace ck_tile {

template <typename T>
__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
{
    for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x)
    {
        p[i] = x;
    }
}

/**
 * @brief Container for storing data in GPU device memory
 *
 */
struct DeviceMem
{
    DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {}

    DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
    {
        HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
    }

    void Realloc(std::size_t mem_size)
    {
        if(mpDeviceBuf)
        {
            HIP_CHECK_ERROR(hipFree(mpDeviceBuf));
        }
        mMemSize = mem_size;
        HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
    }

    void* GetDeviceBuffer() const { return mpDeviceBuf; }

    std::size_t GetBufferSize() const { return mMemSize; }

    void ToDevice(const void* p) const
    {
        if(mpDeviceBuf)
        {
            HIP_CHECK_ERROR(
                hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
        }
        else
        {
            throw std::runtime_error("ToDevice with an empty pointer");
        }
    }

    void ToDevice(const void* p, const std::size_t cpySize) const
    {
        HIP_CHECK_ERROR(
            hipMemcpy(mpDeviceBuf, const_cast<void*>(p), cpySize, hipMemcpyHostToDevice));
    }

    void FromDevice(void* p) const
    {
        if(mpDeviceBuf)
        {
            HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
        }
        else
        {
            throw std::runtime_error("FromDevice with an empty pointer");
        }
    }

    void FromDevice(void* p, const std::size_t cpySize) const
    {
        HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
    }

    void SetZero() const
    {
        if(mpDeviceBuf)
        {
            HIP_CHECK_ERROR(hipMemset(mpDeviceBuf, 0, mMemSize));
        }
    }

    template <typename T>
    void SetValue(T x) const
    {
        if(mMemSize % sizeof(T) != 0)
        {
            throw std::runtime_error("wrong! not entire DeviceMem will be set");
        }
        // TODO: call a gpu kernel to set the value (?)
        set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
    }

    ~DeviceMem()
    {
        if(mpDeviceBuf)
        {
            try
            {
                HIP_CHECK_ERROR(hipFree(mpDeviceBuf));
            }
            catch(std::runtime_error& re)
            {
                std::cerr << re.what() << std::endl;
            }
        }
    }

    void* mpDeviceBuf;
    std::size_t mMemSize;
};

} // namespace ck_tile
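
Round-trip sketch (not from the commit): allocate, copy in, fill on the GPU, copy back. SetValue launches the single-block set_buffer_value kernel above, and the destructor releases the buffer with hipFree.

#include "ck_tile/host/device_memory.hpp"
#include <vector>

void roundtrip()
{
    std::vector<float> host(1024, 1.f);

    ck_tile::DeviceMem buf(host.size() * sizeof(float)); // hipMalloc
    buf.ToDevice(host.data());   // copies mMemSize bytes host -> device
    buf.SetValue(2.f);           // overwrites the buffer on the GPU
    buf.FromDevice(host.data()); // copies back; host now holds 2.f
}                                // ~DeviceMem frees the buffer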
include/ck_tile/host/fill.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <algorithm>
#include <cmath>
#include <iterator>
#include <optional>
#include <random>
#include <type_traits>
#include <utility>

#include "ck_tile/core.hpp"

namespace ck_tile {

template <typename T>
struct FillUniformDistribution
{
    float a_{-5.f};
    float b_{5.f};
    std::optional<uint32_t> seed_{11939};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
        std::uniform_real_distribution<float> dis(a_, b_);
        std::generate(first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
    }

    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const
        -> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};

template <typename T>
struct FillNormalDistribution
{
    float mean_{0.f};
    float variance_{1.f};
    std::optional<uint32_t> seed_{11939};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
        std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
        std::generate(first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
    }

    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const
        -> std::void_t<decltype(std::declval<const FillNormalDistribution&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};

// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
// However this produces segfaults in std::mt19937 which look like an infinite loop.
// template <typename T>
// struct FillUniformDistributionIntegerValue
// {
//     int a_{-5};
//     int b_{5};
//
//     template <typename ForwardIter>
//     void operator()(ForwardIter first, ForwardIter last) const
//     {
//         std::mt19937 gen(11939);
//         std::uniform_int_distribution<int> dis(a_, b_);
//         std::generate(
//             first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
//     }
// };

// Workaround for uniform_int_distribution not working as expected. See note above.
template <typename T>
struct FillUniformDistributionIntegerValue
{
    float a_{-5.f};
    float b_{5.f};
    std::optional<uint32_t> seed_{11939};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
        std::uniform_real_distribution<float> dis(a_, b_);
        std::generate(
            first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(std::round(dis(gen))); });
    }

    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const
        -> std::void_t<decltype(std::declval<const FillUniformDistributionIntegerValue&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};

template <typename T>
struct FillNormalDistributionIntegerValue
{
    float mean_{0.f};
    float variance_{1.f};
    std::optional<uint32_t> seed_{11939};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
        std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
        std::generate(
            first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(std::round(dis(gen))); });
    }

    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const
        -> std::void_t<decltype(std::declval<const FillNormalDistributionIntegerValue&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};

template <typename T>
struct FillMonotonicSeq
{
    T init_value_{0};
    T step_{1};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::generate(first, last, [=, n = init_value_]() mutable {
            auto tmp = n;
            n += step_;
            return tmp;
        });
    }

    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const
        -> std::void_t<decltype(std::declval<const FillMonotonicSeq&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};

template <typename T>
struct FillConstant
{
    T value_{0};

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::fill(first, last, value_);
    }

    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const
        -> std::void_t<decltype(std::declval<const FillConstant&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};

template <typename T, bool UseCos = true, bool UseAbs = false>
struct FillTrigValue
{
    template <typename T_, bool UseCos_ = true, bool UseAbs_ = false>
    struct LinearTrigGen
    {
        int i{0};
        auto operator()()
        {
            float v = 0;
            if constexpr(UseCos_)
            {
                v = cos(i);
            }
            else
            {
                v = sin(i);
            }
            if constexpr(UseAbs_)
                v = abs(v);
            i++;
            return ck_tile::type_convert<T_>(v);
        }
    };

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        LinearTrigGen<T, UseCos, UseAbs> gen;
        std::generate(first, last, gen);
    }

    template <typename ForwardRange>
    auto operator()(ForwardRange&& range) const
        -> std::void_t<decltype(std::declval<const FillTrigValue&>()(
            std::begin(std::forward<ForwardRange>(range)),
            std::end(std::forward<ForwardRange>(range))))>
    {
        (*this)(std::begin(std::forward<ForwardRange>(range)),
                std::end(std::forward<ForwardRange>(range)));
    }
};

} // namespace ck_tile
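
Usage sketch (editor's illustration): each filler is a functor usable either on an iterator pair or on a whole range; seed_ defaults to 11939, so runs are reproducible unless the seed is reset to std::nullopt.

#include "ck_tile/host/fill.hpp"
#include <vector>

void fill_demo()
{
    std::vector<float> a(256), b(256);
    ck_tile::FillUniformDistribution<float>{-1.f, 1.f}(a); // range overload
    ck_tile::FillConstant<float>{0.f}(b.begin(), b.end()); // iterator overload
}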
include/ck_tile/host/hip_check_error.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core/config.hpp"

#include <sstream>
#include <stdexcept>

#include <hip/hip_runtime.h>

namespace ck_tile {
// To be removed, which really does not tell the location of failed HIP functional call
CK_TILE_HOST void hip_check_error(hipError_t x)
{
    if(x != hipSuccess)
    {
        std::ostringstream ss;
        ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
           << "in function: " << __func__;
        throw std::runtime_error(ss.str());
    }
}
} // namespace ck_tile

#define HIP_CHECK_ERROR(retval_or_funcall)                                         \
    do                                                                             \
    {                                                                              \
        hipError_t _tmpVal = retval_or_funcall;                                    \
        if(_tmpVal != hipSuccess)                                                  \
        {                                                                          \
            std::ostringstream ostr;                                               \
            ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
                 << hipGetErrorString(_tmpVal);                                    \
            throw std::runtime_error(ostr.str());                                  \
        }                                                                          \
    } while(0)
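
Call sketch (editor's illustration): the macro records __FILE__/__LINE__ at the call site, which is why the comment above marks the plain hip_check_error() function, whose message always points at this header, for removal.

#include "ck_tile/host/hip_check_error.hpp"

void alloc_and_free()
{
    void* p = nullptr;
    HIP_CHECK_ERROR(hipMalloc(&p, 1024)); // throws std::runtime_error on failure
    HIP_CHECK_ERROR(hipFree(p));
}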
include/ck_tile/host/host_tensor.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <algorithm>
#include <cassert>
#include <iostream>
#include <iomanip>
#include <numeric>
#include <thread>
#include <utility>
#include <vector>

#include "ck_tile/core.hpp"
#include "ck_tile/host/ranges.hpp"

namespace ck_tile {

template <typename Range>
CK_TILE_HOST std::ostream& LogRange(std::ostream& os,
                                    Range&& range,
                                    std::string delim,
                                    int precision = std::cout.precision(),
                                    int width     = 0)
{
    bool first = true;
    for(auto&& v : range)
    {
        if(first)
            first = false;
        else
            os << delim;
        os << std::setw(width) << std::setprecision(precision) << v;
    }
    return os;
}

template <typename T, typename Range>
CK_TILE_HOST std::ostream& LogRangeAsType(std::ostream& os,
                                          Range&& range,
                                          std::string delim,
                                          int precision = std::cout.precision(),
                                          int width     = 0)
{
    bool first = true;
    for(auto&& v : range)
    {
        if(first)
            first = false;
        else
            os << delim;
        os << std::setw(width) << std::setprecision(precision) << static_cast<T>(v);
    }
    return os;
}

template <typename F, typename T, std::size_t... Is>
CK_TILE_HOST auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
    return f(std::get<Is>(args)...);
}

template <typename F, typename T>
CK_TILE_HOST auto call_f_unpack_args(F f, T args)
{
    constexpr std::size_t N = std::tuple_size<T>{};
    return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
}

template <typename F, typename T, std::size_t... Is>
CK_TILE_HOST auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
{
    return F(std::get<Is>(args)...);
}

template <typename F, typename T>
CK_TILE_HOST auto construct_f_unpack_args(F, T args)
{
    constexpr std::size_t N = std::tuple_size<T>{};
    return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
}

struct HostTensorDescriptor
{
    HostTensorDescriptor() = default;

    void CalculateStrides()
    {
        mStrides.clear();
        mStrides.resize(mLens.size(), 0);
        if(mStrides.empty())
            return;

        mStrides.back() = 1;
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
    }

    template <typename X, typename = std::enable_if_t<std::is_convertible_v<X, std::size_t>>>
    HostTensorDescriptor(const std::initializer_list<X>& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

    template <typename Lengths,
              typename = std::enable_if_t<
                  std::is_convertible_v<ck_tile::ranges::range_value_t<Lengths>, std::size_t>>>
    HostTensorDescriptor(const Lengths& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

    template <typename X,
              typename Y,
              typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
                                          std::is_convertible_v<Y, std::size_t>>>
    HostTensorDescriptor(const std::initializer_list<X>& lens,
                         const std::initializer_list<Y>& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }

    template <typename Lengths,
              typename Strides,
              typename = std::enable_if_t<
                  std::is_convertible_v<ck_tile::ranges::range_value_t<Lengths>, std::size_t> &&
                  std::is_convertible_v<ck_tile::ranges::range_value_t<Strides>, std::size_t>>>
    HostTensorDescriptor(const Lengths& lens, const Strides& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }

    std::size_t get_num_of_dimension() const { return mLens.size(); }

    std::size_t get_element_size() const
    {
        assert(mLens.size() == mStrides.size());
        return std::accumulate(
            mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
    }

    std::size_t get_element_space_size() const
    {
        std::size_t space = 1;
        for(std::size_t i = 0; i < mLens.size(); ++i)
        {
            if(mLens[i] == 0)
                continue;

            space += (mLens[i] - 1) * mStrides[i];
        }
        return space;
    }

    const std::vector<std::size_t>& get_lengths() const { return mLens; }
    const std::vector<std::size_t>& GetStrides() const { return mStrides; }

    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        assert(sizeof...(Is) == this->get_num_of_dimension());
        std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

    std::size_t GetOffsetFromMultiIndex(std::vector<std::size_t> iss) const
    {
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

    friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc);

    private:
    std::vector<std::size_t> mLens;
    std::vector<std::size_t> mStrides;
};

template <typename New2Old>
CK_TILE_HOST HostTensorDescriptor
transpose_host_tensor_descriptor_given_new2old(const HostTensorDescriptor& a,
                                               const New2Old& new2old)
{
    std::vector<std::size_t> new_lengths(a.get_num_of_dimension());
    std::vector<std::size_t> new_strides(a.get_num_of_dimension());

    for(std::size_t i = 0; i < a.get_num_of_dimension(); i++)
    {
        new_lengths[i] = a.get_lengths()[new2old[i]];
        new_strides[i] = a.GetStrides()[new2old[i]];
    }

    return HostTensorDescriptor(new_lengths, new_strides);
}

struct joinable_thread : std::thread
{
    template <typename... Xs>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }

    joinable_thread(joinable_thread&&) = default;
    joinable_thread& operator=(joinable_thread&&) = default;

    ~joinable_thread()
    {
        if(this->joinable())
            this->join();
    }
};

template <typename F, typename... Xs>
struct ParallelTensorFunctor
{
    F mF;
    static constexpr std::size_t NDIM = sizeof...(Xs);
    std::array<std::size_t, NDIM> mLens;
    std::array<std::size_t, NDIM> mStrides;
    std::size_t mN1d;

    ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
    {
        mStrides.back() = 1;
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
        mN1d = mStrides[0] * mLens[0];
    }

    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
    {
        std::array<std::size_t, NDIM> indices;

        for(std::size_t idim = 0; idim < NDIM; ++idim)
        {
            indices[idim] = i / mStrides[idim];
            i -= indices[idim] * mStrides[idim];
        }

        return indices;
    }

    void operator()(std::size_t num_thread = 1) const
    {
        std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

        std::vector<joinable_thread> threads(num_thread);

        for(std::size_t it = 0; it < num_thread; ++it)
        {
            std::size_t iw_begin = it * work_per_thread;
            std::size_t iw_end   = std::min((it + 1) * work_per_thread, mN1d);

            auto f = [this, iw_begin, iw_end] {
                for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                {
                    call_f_unpack_args(this->mF, this->GetNdIndices(iw));
                }
            };
            threads[it] = joinable_thread(f);
        }
    }
};

template <typename F, typename... Xs>
CK_TILE_HOST auto make_ParallelTensorFunctor(F f, Xs... xs)
{
    return ParallelTensorFunctor<F, Xs...>(f, xs...);
}

template <typename T>
struct HostTensor
{
    using Descriptor = HostTensorDescriptor;
    using Data       = std::vector<T>;

    template <typename X>
    HostTensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.get_element_space_size())
    {
    }

    template <typename X, typename Y>
    HostTensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
        : mDesc(lens, strides), mData(mDesc.get_element_space_size())
    {
    }

    template <typename Lengths>
    HostTensor(const Lengths& lens) : mDesc(lens), mData(mDesc.get_element_space_size())
    {
    }

    template <typename Lengths, typename Strides>
    HostTensor(const Lengths& lens, const Strides& strides)
        : mDesc(lens, strides), mData(get_element_space_size())
    {
    }

    HostTensor(const Descriptor& desc) : mDesc(desc), mData(mDesc.get_element_space_size()) {}

    template <typename OutT>
    HostTensor<OutT> CopyAsType() const
    {
        HostTensor<OutT> ret(mDesc);
        std::transform(mData.cbegin(), mData.cend(), ret.mData.begin(), [](auto value) {
            return ck_tile::type_convert<OutT>(value);
        });
        return ret;
    }

    HostTensor()                  = delete;
    HostTensor(const HostTensor&) = default;
    HostTensor(HostTensor&&)      = default;

    ~HostTensor() = default;

    HostTensor& operator=(const HostTensor&) = default;
    HostTensor& operator=(HostTensor&&) = default;

    template <typename FromT>
    explicit HostTensor(const HostTensor<FromT>& other) : HostTensor(other.template CopyAsType<T>())
    {
    }

    decltype(auto) get_lengths() const { return mDesc.get_lengths(); }
    decltype(auto) GetStrides() const { return mDesc.GetStrides(); }

    std::size_t get_num_of_dimension() const { return mDesc.get_num_of_dimension(); }
    std::size_t get_element_size() const { return mDesc.get_element_size(); }
    std::size_t get_element_space_size() const { return mDesc.get_element_space_size(); }

    std::size_t get_element_space_size_in_bytes() const
    {
        return sizeof(T) * get_element_space_size();
    }

    // void SetZero() { ck_tile::ranges::fill<T>(mData, 0); }
    void SetZero() { std::fill(mData.begin(), mData.end(), 0); }

    template <typename F>
    void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
    {
        if(rank == mDesc.get_num_of_dimension())
        {
            f(*this, idx);
            return;
        }
        // else
        for(size_t i = 0; i < mDesc.get_lengths()[rank]; i++)
        {
            idx[rank] = i;
            ForEach_impl(std::forward<F>(f), idx, rank + 1);
        }
    }

    template <typename F>
    void ForEach(F&& f)
    {
        std::vector<size_t> idx(mDesc.get_num_of_dimension(), 0);
        ForEach_impl(std::forward<F>(f), idx, size_t(0));
    }

    template <typename F>
    void ForEach_impl(const F&& f, std::vector<size_t>& idx, size_t rank) const
    {
        if(rank == mDesc.get_num_of_dimension())
        {
            f(*this, idx);
            return;
        }
        // else
        for(size_t i = 0; i < mDesc.get_lengths()[rank]; i++)
        {
            idx[rank] = i;
            ForEach_impl(std::forward<const F>(f), idx, rank + 1);
        }
    }

    template <typename F>
    void ForEach(const F&& f) const
    {
        std::vector<size_t> idx(mDesc.get_num_of_dimension(), 0);
        ForEach_impl(std::forward<const F>(f), idx, size_t(0));
    }

    template <typename G>
    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        switch(mDesc.get_num_of_dimension())
        {
        case 1: {
            auto f = [&](auto i) { (*this)(i) = g(i); };
            make_ParallelTensorFunctor(f, mDesc.get_lengths()[0])(num_thread);
            break;
        }
        case 2: {
            auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
            make_ParallelTensorFunctor(f, mDesc.get_lengths()[0], mDesc.get_lengths()[1])(
                num_thread);
            break;
        }
        case 3: {
            auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
            make_ParallelTensorFunctor(
                f, mDesc.get_lengths()[0], mDesc.get_lengths()[1], mDesc.get_lengths()[2])(
                num_thread);
            break;
        }
        case 4: {
            auto f = [&](auto i0, auto i1, auto i2, auto i3) {
                (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
            };
            make_ParallelTensorFunctor(f,
                                       mDesc.get_lengths()[0],
                                       mDesc.get_lengths()[1],
                                       mDesc.get_lengths()[2],
                                       mDesc.get_lengths()[3])(num_thread);
            break;
        }
        case 5: {
            auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
                (*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
            };
            make_ParallelTensorFunctor(f,
                                       mDesc.get_lengths()[0],
                                       mDesc.get_lengths()[1],
                                       mDesc.get_lengths()[2],
                                       mDesc.get_lengths()[3],
                                       mDesc.get_lengths()[4])(num_thread);
            break;
        }
        case 6: {
            auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) {
                (*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5);
            };
            make_ParallelTensorFunctor(f,
                                       mDesc.get_lengths()[0],
                                       mDesc.get_lengths()[1],
                                       mDesc.get_lengths()[2],
                                       mDesc.get_lengths()[3],
                                       mDesc.get_lengths()[4],
                                       mDesc.get_lengths()[5])(num_thread);
            break;
        }
        default: throw std::runtime_error("unsupported dimension");
        }
    }

    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        return mDesc.GetOffsetFromMultiIndex(is...);
    }

    template <typename... Is>
    T& operator()(Is... is)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }

    template <typename... Is>
    const T& operator()(Is... is) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }

    T& operator()(std::vector<std::size_t> idx)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(idx)];
    }

    const T& operator()(std::vector<std::size_t> idx) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(idx)];
    }

    typename Data::iterator begin() { return mData.begin(); }
    typename Data::iterator end() { return mData.end(); }
    typename Data::pointer data() { return mData.data(); }

    typename Data::const_iterator begin() const { return mData.begin(); }
    typename Data::const_iterator end() const { return mData.end(); }
    typename Data::const_pointer data() const { return mData.data(); }

    typename Data::size_type size() const { return mData.size(); }

    template <typename U = T>
    auto AsSpan() const
    {
        constexpr std::size_t FromSize = sizeof(T);
        constexpr std::size_t ToSize   = sizeof(U);

        using Element = std::add_const_t<std::remove_reference_t<U>>;
        return ck_tile::span<Element>{reinterpret_cast<Element*>(data()),
                                      size() * FromSize / ToSize};
    }

    template <typename U = T>
    auto AsSpan()
    {
        constexpr std::size_t FromSize = sizeof(T);
        constexpr std::size_t ToSize   = sizeof(U);

        using Element = std::remove_reference_t<U>;
        return ck_tile::span<Element>{reinterpret_cast<Element*>(data()),
                                      size() * FromSize / ToSize};
    }

    Descriptor mDesc;
    Data mData;
};

} // namespace ck_tile
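
Usage sketch (not part of the commit): a rank-2 tensor with lengths {2, 3} gets row-major strides {3, 1} from CalculateStrides(); elements are addressed either with a parameter pack or a std::vector index, and ForEach visits every multi-index.

#include "ck_tile/host/host_tensor.hpp"

void host_tensor_demo()
{
    ck_tile::HostTensor<float> t({2, 3}); // strides become {3, 1}

    for(std::size_t i = 0; i < 2; ++i)
        for(std::size_t j = 0; j < 3; ++j)
            t(i, j) = static_cast<float>(i * 3 + j);

    float sum = 0.f;
    t.ForEach([&](auto& self, auto idx) { sum += self(idx); }); // visits all 6 entries
}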
include/ck_tile/host/kernel_launch.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core/config.hpp"
#include "ck_tile/host/stream_config.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include <hip/hip_runtime.h>
#include <cstddef>

namespace ck_tile {

template <int MaxThreadPerBlock, int MinBlockPerCu, typename Kernel, typename... Args>
#if CK_TILE_USE_LAUNCH_BOUNDS
__launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
#endif
    __global__ void kentry(Kernel f, Args... args)
{
    f(args...);
}

template <typename... Args, typename F>
CK_TILE_HOST float launch_and_time_kernel(const stream_config& s,
                                          F kernel,
                                          dim3 grid_dim,
                                          dim3 block_dim,
                                          std::size_t lds_byte,
                                          Args... args)
{
#if CK_TILE_TIME_KERNEL
    if(s.time_kernel_)
    {
        // warm up
        for(int i = 0; i < s.cold_niters_; ++i)
        {
            kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
            hip_check_error(hipGetLastError());
        }

        const int nrepeat = s.nrepeat_;

        hipEvent_t start, stop;

        HIP_CHECK_ERROR(hipEventCreate(&start));
        HIP_CHECK_ERROR(hipEventCreate(&stop));

        HIP_CHECK_ERROR(hipDeviceSynchronize());
        HIP_CHECK_ERROR(hipEventRecord(start, s.stream_id_));

        for(int i = 0; i < nrepeat; ++i)
        {
            kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
            hip_check_error(hipGetLastError());
        }

        HIP_CHECK_ERROR(hipEventRecord(stop, s.stream_id_));
        HIP_CHECK_ERROR(hipEventSynchronize(stop));

        float total_time = 0;

        HIP_CHECK_ERROR(hipEventElapsedTime(&total_time, start, stop));

        return total_time / nrepeat;
    }
    else
    {
        kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
        hip_check_error(hipGetLastError());

        return 0;
    }
#else
    kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
    hip_check_error(hipGetLastError());

    return 0;
#endif
}

template <typename... Args, typename F, typename PreProcessFunc>
CK_TILE_HOST float launch_and_time_kernel_with_preprocess(const stream_config& s,
                                                          PreProcessFunc preprocess,
                                                          F kernel,
                                                          dim3 grid_dim,
                                                          dim3 block_dim,
                                                          std::size_t lds_byte,
                                                          Args... args)
{
#if CK_TILE_TIME_KERNEL
    if(s.time_kernel_)
    {
#if CK_TILE_DEBUG_LOG
        printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d}\n",
               __func__,
               grid_dim.x,
               grid_dim.y,
               grid_dim.z,
               block_dim.x,
               block_dim.y,
               block_dim.z);

        printf("Warm up 1 time\n");
#endif
        // warm up
        preprocess();
        kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
        hip_check_error(hipGetLastError());

        const int nrepeat = 10;
#if CK_TILE_DEBUG_LOG
        printf("Start running %d times...\n", nrepeat);
#endif
        hipEvent_t start, stop;

        HIP_CHECK_ERROR(hipEventCreate(&start));
        HIP_CHECK_ERROR(hipEventCreate(&stop));

        HIP_CHECK_ERROR(hipDeviceSynchronize());
        HIP_CHECK_ERROR(hipEventRecord(start, s.stream_id_));

        for(int i = 0; i < nrepeat; ++i)
        {
            preprocess();
            kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
            hip_check_error(hipGetLastError());
        }

        HIP_CHECK_ERROR(hipEventRecord(stop, s.stream_id_));
        HIP_CHECK_ERROR(hipEventSynchronize(stop));

        float total_time = 0;

        HIP_CHECK_ERROR(hipEventElapsedTime(&total_time, start, stop));

        return total_time / nrepeat;
    }
    else
    {
        preprocess();
        kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
        hip_check_error(hipGetLastError());

        return 0;
    }
#else
    kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
    hip_check_error(hipGetLastError());

    return 0;
#endif
}

template <int MaxThreadPerBlock = CK_TILE_MAX_THREAD_PER_BLOCK,
          int MinBlockPerCu     = CK_TILE_MIN_BLOCK_PER_CU,
          typename KernelImpl,
          typename... Args>
CK_TILE_HOST float launch_kernel(const stream_config& s,
                                 KernelImpl kernel_impl,
                                 dim3 grid_dim,
                                 dim3 block_dim,
                                 std::size_t dynamic_smem_byte,
                                 Args... args)
{
    const auto kernel = kentry<MaxThreadPerBlock, MinBlockPerCu, KernelImpl, Args...>;

    return launch_and_time_kernel(
        s, kernel, grid_dim, block_dim, dynamic_smem_byte, kernel_impl, args...);
}

} // namespace ck_tile
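
Launch sketch (editor's illustration; the functor is made up): launch_kernel wraps any device-callable functor in the kentry entry point and forwards to launch_and_time_kernel, which returns the average milliseconds per repeat when s.time_kernel_ is set and 0 otherwise.

#include "ck_tile/host/kernel_launch.hpp"

struct fill_functor // hypothetical kernel body
{
    float* p;
    float  v;
    __device__ void operator()() const { p[blockIdx.x * blockDim.x + threadIdx.x] = v; }
};

float run_fill(const ck_tile::stream_config& s, float* p_dev)
{
    // 16 blocks x 256 threads, no dynamic LDS
    return ck_tile::launch_kernel(s, fill_functor{p_dev, 1.f}, dim3(16), dim3(256), 0);
}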
include/ck_tile/host/ranges.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iterator>
#include <type_traits>
#include <utility>

// This ranges implementation is not intended to be used by users.
// TODO: do we need this?

namespace ck_tile {

template <typename T>
using iter_value_t = typename std::iterator_traits<remove_cvref_t<T>>::value_type;

template <typename T>
using iter_reference_t = decltype(*std::declval<T&>());

template <typename T>
using iter_difference_t = typename std::iterator_traits<remove_cvref_t<T>>::difference_type;

namespace ranges {

template <typename R>
using iterator_t = decltype(std::begin(std::declval<R&>()));

template <typename R>
using sentinel_t = decltype(std::end(std::declval<R&>()));

template <typename R>
using range_size_t = decltype(std::size(std::declval<R&>()));

template <typename R>
using range_difference_t = ck_tile::iter_difference_t<ranges::iterator_t<R>>;

template <typename R>
using range_value_t = iter_value_t<ranges::iterator_t<R>>;

template <typename R>
using range_reference_t = iter_reference_t<ranges::iterator_t<R>>;

template <typename T, typename = void>
struct is_range : std::false_type
{
};

template <typename T>
struct is_range<T,
                std::void_t<decltype(std::begin(std::declval<T&>())),
                            decltype(std::end(std::declval<T&>()))>> : std::true_type
{
};

template <typename T>
inline constexpr bool is_range_v = is_range<T>::value;

template <typename T, typename = void>
struct is_sized_range : std::false_type
{
};

template <typename T>
struct is_sized_range<T, std::void_t<decltype(std::size(std::declval<T&>()))>>
    : std::bool_constant<is_range_v<T>>
{
};

} // namespace ranges
} // namespace ck_tile
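
These aliases are what check_err.hpp uses to constrain its overloads; a couple of compile-time checks illustrate them (editor's sketch, included via the umbrella header so remove_cvref_t from ck_tile/core is in scope):

#include "ck_tile/host.hpp"
#include <vector>

static_assert(std::is_same_v<ck_tile::ranges::range_value_t<std::vector<float>>, float>);
static_assert(ck_tile::ranges::is_range_v<std::vector<float>>);
static_assert(!ck_tile::ranges::is_range_v<int>); // no begin()/end()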
include/ck_tile/host/reference/reference_batched_elementwise.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"

#include <thread>

namespace ck_tile {

template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename AElementOp      = ck_tile::identity,
          typename BElementOp      = ck_tile::identity,
          typename BinaryElementOp = ck_tile::plus<AccDataType>>
CK_TILE_HOST void reference_batched_elementwise(const HostTensor<ADataType>& a_b_m_n,
                                                const HostTensor<BDataType>& b_b_m_n,
                                                HostTensor<CDataType>& c_b_m_n,
                                                const AElementOp& a_element_op           = {},
                                                const BElementOp& b_element_op           = {},
                                                const BinaryElementOp& binary_element_op = {})
{
    const ck_tile::index_t N = c_b_m_n.mDesc.get_lengths()[2];

    const bool broadcast_a_dim_b = (a_b_m_n.get_lengths()[0] == 1);
    const bool broadcast_a_dim_m = (a_b_m_n.get_lengths()[1] == 1);
    const bool broadcast_a_dim_n = (a_b_m_n.get_lengths()[2] == 1);

    const bool broadcast_b_dim_b = (b_b_m_n.get_lengths()[0] == 1);
    const bool broadcast_b_dim_m = (b_b_m_n.get_lengths()[1] == 1);
    const bool broadcast_b_dim_n = (b_b_m_n.get_lengths()[2] == 1);

    auto f = [&](auto batch, auto m) {
        for(ck_tile::index_t n = 0; n < N; ++n)
        {
            AccDataType v_a{};
            {
                ck_tile::index_t i_b = (broadcast_a_dim_b ? 0 : batch);
                ck_tile::index_t i_m = (broadcast_a_dim_m ? 0 : m);
                ck_tile::index_t i_n = (broadcast_a_dim_n ? 0 : n);

                v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_b_m_n(i_b, i_m, i_n)));
            }

            AccDataType v_b{};
            {
                ck_tile::index_t i_b = (broadcast_b_dim_b ? 0 : batch);
                ck_tile::index_t i_m = (broadcast_b_dim_m ? 0 : m);
                ck_tile::index_t i_n = (broadcast_b_dim_n ? 0 : n);

                v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_b_m_n(i_b, i_m, i_n)));
            }

            c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(binary_element_op(v_a, v_b));
        }
    };

    make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
        std::thread::hardware_concurrency());
}
} // namespace ck_tile
include/ck_tile/host/reference/reference_batched_gemm.hpp
0 → 100644
View file @
4396a224
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"

#include <thread>

namespace ck_tile {

template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename AElementOp   = ck_tile::identity,
          typename BElementOp   = ck_tile::identity,
          typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_batched_gemm(const HostTensor<ADataType>& a_b_m_k,
                                         const HostTensor<BDataType>& b_b_n_k,
                                         HostTensor<CDataType>& c_b_m_n,
                                         const AElementOp& a_element_op     = {},
                                         const BElementOp& b_element_op     = {},
                                         const ACCElementOp& acc_element_op = {})
{
    const int N = b_b_n_k.mDesc.get_lengths()[1];
    const int K = b_b_n_k.mDesc.get_lengths()[2];

    auto f = [&](auto batch, auto m) {
        for(int n = 0; n < N; ++n)
        {
            AccDataType v_acc = 0;

            for(int k = 0; k < K; ++k)
            {
                ADataType v_a = a_element_op(a_b_m_k(batch, m, k));
                BDataType v_b = b_element_op(b_b_n_k(batch, n, k));

                v_acc += ck_tile::type_convert<AccDataType>(v_a) *
                         ck_tile::type_convert<AccDataType>(v_b);
            }

            c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
        }
    };

    make_ParallelTensorFunctor(f,
                               c_b_m_n.mDesc.get_lengths()[0],
                               c_b_m_n.mDesc.get_lengths()[1])(std::thread::hardware_concurrency());
}
} // namespace ck_tile
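Note that B is laid out as [batch, n, k], so both operands are read K-contiguously and each batch effectively computes C = A * B^T in row-major terms. A hypothetical usage sketch (the HostTensor length-list constructor is assumed, not confirmed by this diff):

// Hypothetical sketch: validate a B x M x N batched GEMM on the host.
#include "ck_tile/host.hpp"

int main()
{
    constexpr int B = 2, M = 64, N = 64, K = 32;

    ck_tile::HostTensor<float> a({B, M, K}); // A stored as [batch, m, k]
    ck_tile::HostTensor<float> b({B, N, K}); // B stored as [batch, n, k]
    ck_tile::HostTensor<float> c({B, M, N});

    // ... fill a and b ...

    // AccDataType cannot be deduced, so all four data types are spelled out:
    // <ADataType, BDataType, AccDataType, CDataType>.
    ck_tile::reference_batched_gemm<float, float, float, float>(a, b, c);
}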
include/ck_tile/host/reference/reference_batched_masking.hpp
0 → 100644
View file @
4396a224
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"

#include <thread>

namespace ck_tile {

template <typename CDataType, typename MaskingType>
CK_TILE_HOST void reference_batched_masking(HostTensor<CDataType>& c_b_m_n,
                                            const MaskingType& mask)
{
    const int M = c_b_m_n.mDesc.get_lengths()[1];
    const int N = c_b_m_n.mDesc.get_lengths()[2];

    auto f = [&](auto batch) {
        for(int n = 0; n < N; ++n)
        {
            for(int m = 0; m < M; ++m)
            {
                if(mask.IsOutOfBound(m, n))
                    c_b_m_n(batch, m, n) = -ck_tile::numeric<CDataType>::infinity();
            }
        }
    };

    make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0])(
        std::thread::hardware_concurrency());
}
} // namespace ck_tile
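The only interface required of MaskingType is IsOutOfBound(m, n); masked positions are overwritten with -infinity in place. A sketch with a hypothetical causal mask type (CausalMask is illustrative, not a type from this commit):

// Hypothetical sketch: mask the upper triangle of attention scores.
#include "ck_tile/host.hpp"

struct CausalMask
{
    // The reference only calls IsOutOfBound(m, n) on a const object.
    bool IsOutOfBound(int m, int n) const { return n > m; }
};

int main()
{
    constexpr int B = 2, M = 8, N = 8;

    ck_tile::HostTensor<float> s({B, M, N}); // assumed length-list constructor

    // ... fill s with attention scores ...

    ck_tile::reference_batched_masking(s, CausalMask{}); // masked entries become -inf
}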
include/ck_tile/host/reference/reference_batched_softmax.hpp
0 → 100644
View file @
4396a224
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"

#include <cmath>
#include <functional>
#include <optional>
#include <thread>

namespace ck_tile {

template <typename ADataType,
          typename CompDataType,
          typename BDataType,
          typename CompElementOp = ck_tile::identity>
CK_TILE_HOST void reference_batched_softmax(
    const HostTensor<ADataType>& a_b_m_n,
    HostTensor<BDataType>& b_b_m_n,
    const CompElementOp& comp_element_op = {},
    std::optional<std::reference_wrapper<HostTensor<CompDataType>>> lse_b_m = std::nullopt)
{
    const int N = a_b_m_n.mDesc.get_lengths()[2];

    auto f = [&](auto batch, auto m) {
        CompDataType v_max = -ck_tile::numeric<CompDataType>::infinity();

        // max
        for(int n = 0; n < N; ++n)
        {
            const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));

            v_max = v_max < v_a ? v_a : v_max;
        }

        CompDataType v_exp_sum = 0;

        // reset v_max in case all the elements within a row are -INF
        if(std::isinf(v_max) && v_max < 0)
        {
            v_max = ck_tile::type_convert<CompDataType>(0.f);
        }

        // sum
        for(int n = 0; n < N; ++n)
        {
            const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));

            v_exp_sum += ck_tile::exp(v_a - v_max);
        }

        // if the sum is zero (masked) or nan/inf (other computation error), don't divide
        CompDataType inv_sum = (v_exp_sum == 0.f ? 1.f : 1.f / v_exp_sum);

        // elementwise
        for(int n = 0; n < N; ++n)
        {
            const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));
            const CompDataType v_b = ck_tile::exp(v_a - v_max) * inv_sum;

            b_b_m_n(batch, m, n) = ck_tile::type_convert<BDataType>(comp_element_op(v_b));
        }

        // lse
        if(lse_b_m)
        {
            lse_b_m->get()(batch, m) = v_max + ck_tile::log(v_exp_sum);
        }
    };

    make_ParallelTensorFunctor(f,
                               b_b_m_n.mDesc.get_lengths()[0],
                               b_b_m_n.mDesc.get_lengths()[1])(std::thread::hardware_concurrency());
}
} // namespace ck_tile
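When the optional lse_b_m tensor is supplied, it receives the numerically stable log-sum-exp, lse(b, m) = v_max + log(sum_n exp(a(b, m, n) - v_max)), i.e. the log of the softmax denominator. A hypothetical usage sketch (HostTensor construction from lengths is assumed):

// Hypothetical sketch: batched row softmax with the optional LSE output.
#include <functional>
#include "ck_tile/host.hpp"

int main()
{
    constexpr int B = 2, M = 16, N = 64;

    ck_tile::HostTensor<float> a({B, M, N});
    ck_tile::HostTensor<float> p({B, M, N});
    ck_tile::HostTensor<float> lse({B, M});

    // ... fill a ...

    // Template order is <ADataType, CompDataType, BDataType>; std::ref(lse)
    // converts to the std::optional<std::reference_wrapper<...>> parameter.
    ck_tile::reference_batched_softmax<float, float, float>(
        a, p, ck_tile::identity{}, std::ref(lse));
}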
include/ck_tile/host/reference/reference_gemm.hpp
0 → 100644
View file @
4396a224
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"

#include <thread>

namespace ck_tile {

template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename AElementOp   = ck_tile::identity,
          typename BElementOp   = ck_tile::identity,
          typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
                                 const HostTensor<BDataType>& b_n_k,
                                 HostTensor<CDataType>& c_m_n,
                                 const AElementOp& a_element_op     = {},
                                 const BElementOp& b_element_op     = {},
                                 const ACCElementOp& acc_element_op = {})
{
    const int N = b_n_k.mDesc.get_lengths()[0];
    const int K = b_n_k.mDesc.get_lengths()[1];

    auto f = [&](auto m) {
        for(int n = 0; n < N; ++n)
        {
            AccDataType v_acc = 0;

            for(int k = 0; k < K; ++k)
            {
                ADataType v_a = a_element_op(a_m_k(m, k));
                BDataType v_b = b_element_op(b_n_k(n, k));

                v_acc += ck_tile::type_convert<AccDataType>(v_a) *
                         ck_tile::type_convert<AccDataType>(v_b);
            }

            c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
        }
    };

    make_ParallelTensorFunctor(f, c_m_n.mDesc.get_lengths()[0])(
        std::thread::hardware_concurrency());
}
} // namespace ck_tile
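As in the batched variant, B is stored as [N, K], so for row-major A this computes C = A * B^T. A hypothetical sketch (HostTensor construction from lengths is assumed):

// Hypothetical sketch: single M x N GEMM validated on the host.
#include "ck_tile/host.hpp"

int main()
{
    constexpr int M = 64, N = 64, K = 32;

    ck_tile::HostTensor<float> a({M, K}); // A as [m, k]
    ck_tile::HostTensor<float> b({N, K}); // B as [n, k], i.e. K-contiguous
    ck_tile::HostTensor<float> c({M, N});

    // ... fill a and b ...

    // Template order is <ADataType, BDataType, AccDataType, CDataType>.
    ck_tile::reference_gemm<float, float, float, float>(a, b, c);
}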
include/ck_tile/host/reference/reference_im2col.hpp
0 → 100644
View file @
4396a224
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"

#include <thread>

namespace ck_tile {

template <typename T>
CK_TILE_HOST void reference_im2col(HostTensor<T>& in_mtx_host_ref,
                                   const HostTensor<T>& in_host,
                                   int /*N*/,
                                   int /*K*/,
                                   int C,
                                   int /*Y*/,
                                   int X,
                                   int Hi,
                                   int Wi,
                                   int Ho,
                                   int Wo,
                                   int ConvStrideH,
                                   int ConvStrideW,
                                   int ConvDilationH,
                                   int ConvDilationW,
                                   int InLeftPadH,
                                   int InLeftPadW,
                                   int /*InRightPadH*/,
                                   int /*InRightPadW*/)
{
    int GemmM = in_mtx_host_ref.get_lengths()[0];
    int GemmK = in_mtx_host_ref.get_lengths()[1];

    for(int gemm_m = 0; gemm_m < GemmM; ++gemm_m)
    {
        int mtmp = gemm_m;
        int n    = mtmp / (Ho * Wo);
        mtmp -= n * Ho * Wo;
        int ho = mtmp / Wo;
        int wo = mtmp - ho * Wo;

        for(int gemm_k = 0; gemm_k < GemmK; ++gemm_k)
        {
            int ktmp = gemm_k;
            int y    = ktmp / (X * C);
            ktmp -= y * X * C;
            int x = ktmp / C;
            int c = ktmp - x * C;

            int hi = y * ConvDilationH + ho * ConvStrideH - InLeftPadH;
            int wi = x * ConvDilationW + wo * ConvStrideW - InLeftPadW;

            bool inbound = (hi >= 0 && hi < Hi && wi >= 0 && wi < Wi);

            in_mtx_host_ref(gemm_m, gemm_k) = inbound ? in_host(n, hi, wi, c) : 0;
        }
    }
}
} // namespace ck_tile
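Row index gemm_m decomposes as n * (Ho * Wo) + ho * Wo + wo, and column index gemm_k as y * (X * C) + x * C + c, so each output row gathers the dilated, strided, padded Y*X*C receptive field of one output pixel, with out-of-image taps written as 0. A hypothetical sketch for a 3x3 filter over a 4x4 NHWC image without padding, so Ho = Wo = 4 - 3 + 1 = 2 (HostTensor construction from lengths is assumed):

// Hypothetical sketch: lower a 1x4x4x3 NHWC input for a 3x3 convolution
// (stride 1, dilation 1, no padding) into its [GemmM, GemmK] im2col matrix,
// where GemmM = N * Ho * Wo = 4 and GemmK = Y * X * C = 27.
#include "ck_tile/host.hpp"

int main()
{
    constexpr int N = 1, K = 1, C = 3, Y = 3, X = 3;
    constexpr int Hi = 4, Wi = 4, Ho = 2, Wo = 2;

    ck_tile::HostTensor<float> in({N, Hi, Wi, C});           // NHWC input
    ck_tile::HostTensor<float> in_mtx({N * Ho * Wo, Y * X * C});

    // ... fill in ...

    ck_tile::reference_im2col(in_mtx, in, N, K, C, Y, X, Hi, Wi, Ho, Wo,
                              /*stride*/ 1, 1, /*dilation*/ 1, 1,
                              /*left pad*/ 0, 0, /*right pad*/ 0, 0);
}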
include/ck_tile/host/reference/reference_reduce.hpp
0 → 100644
View file @
4396a224
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"

#include <thread>

namespace ck_tile {

template <typename ADataType, typename AccDataType, typename BDataType>
CK_TILE_HOST void reference_reduce(const HostTensor<ADataType>& a_m_n, HostTensor<BDataType>& b_m)
{
    auto f = [&](auto m) {
        const int N = a_m_n.mDesc.get_lengths()[1];

        AccDataType v_acc = 0;

        for(int n = 0; n < N; ++n)
        {
            const ADataType v_a = a_m_n(m, n);

            v_acc += v_a;
        }

        b_m(m) = ck_tile::type_convert<BDataType>(v_acc);
    };

    make_ParallelTensorFunctor(f, b_m.mDesc.get_lengths()[0])(
        std::thread::hardware_concurrency());
}
} // namespace ck_tile
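This reduces each row of an [M, N] tensor to a single value by summation in AccDataType. A hypothetical sketch (HostTensor construction from lengths is assumed):

// Hypothetical sketch: row-sum of an [M, N] tensor into an [M] tensor,
// accumulating in float.
#include "ck_tile/host.hpp"

int main()
{
    constexpr int M = 8, N = 128;

    ck_tile::HostTensor<float> a({M, N});
    ck_tile::HostTensor<float> b({M});

    // ... fill a ...

    // Template order is <ADataType, AccDataType, BDataType>.
    ck_tile::reference_reduce<float, float, float>(a, b);
}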
include/ck_tile/host/reference/reference_softmax.hpp
0 → 100644
View file @
4396a224
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"

#include <thread>

namespace ck_tile {

template <typename ADataType, typename AccDataType, typename BDataType>
CK_TILE_HOST void reference_softmax(const HostTensor<ADataType>& a_m_n,
                                    HostTensor<BDataType>& b_m_n)
{
    auto f = [&](auto m) {
        const int N = a_m_n.mDesc.get_lengths()[1];

        AccDataType v_max = ck_tile::numeric<ADataType>::Lowest();

        // max
        for(int n = 0; n < N; ++n)
        {
            const ADataType v_a = a_m_n(m, n);

            v_max = v_max < v_a ? v_a : v_max;
        }

        AccDataType v_exp_sum = 0;

        // sum
        for(int n = 0; n < N; ++n)
        {
            const ADataType v_a = a_m_n(m, n);

            v_exp_sum += ck_tile::exp(v_a - v_max);
        }

        // elementwise
        for(int n = 0; n < N; ++n)
        {
            const ADataType v_a = a_m_n(m, n);

            b_m_n(m, n) = ck_tile::exp(v_a - v_max) / v_exp_sum;
        }
    };

    make_ParallelTensorFunctor(f, b_m_n.mDesc.get_lengths()[0])(
        std::thread::hardware_concurrency());
}
} // namespace ck_tile
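Unlike the batched variant above, this 2D version does not guard against rows that are entirely -infinity and offers no log-sum-exp output. A hypothetical sketch (HostTensor construction from lengths is assumed):

// Hypothetical sketch: unbatched row softmax of an [M, N] tensor.
#include "ck_tile/host.hpp"

int main()
{
    constexpr int M = 8, N = 128;

    ck_tile::HostTensor<float> a({M, N});
    ck_tile::HostTensor<float> b({M, N});

    // ... fill a ...

    // Template order is <ADataType, AccDataType, BDataType>.
    ck_tile::reference_softmax<float, float, float>(a, b);
}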
include/ck_tile/host/stream_config.hpp
0 → 100644
View file @
4396a224
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <hip/hip_runtime.h>

namespace ck_tile {

struct stream_config
{
    hipStream_t stream_id_ = nullptr;
    bool time_kernel_      = false;
    int log_level_         = 0;
    int cold_niters_       = 3;
    int nrepeat_           = 10;
};
} // namespace ck_tile
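The defaults describe a non-timed launch on the default HIP stream, with 3 warm-up iterations and 10 timed repeats taking effect once timing is enabled; the launch helpers in kernel_launch.hpp, also added in this commit, are the expected consumers. A hypothetical sketch:

// Hypothetical sketch: enable kernel timing while keeping the other defaults.
#include "ck_tile/host.hpp"

int main()
{
    ck_tile::stream_config cfg{};
    cfg.time_kernel_ = true; // time the kernel over nrepeat_ = 10 runs
    // cfg.stream_id_ remains nullptr, i.e. the default HIP stream;
    // cfg.cold_niters_ = 3 warm-up launches precede the timed ones (assumed
    // semantics, inferred from the field names).
    // cfg would then be passed to a launch helper (see kernel_launch.hpp).
}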
include/ck_tile/ops/common.hpp
0 → 100644
View file @
4396a224
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/common/tensor_layout.hpp"