gaoqiong / composable_kernel_ROCM / Commits / 10127959

Commit 10127959, authored Mar 08, 2024 by illsilin
Merge branch 'navi3_rel' of github.com:ROCm/composable_kernel into navi3_rel
Parents: 7b28bcb3, 32371ea5

Changes: 40 files. Showing 20 changed files with 1785 additions and 123 deletions (+1785, -123).
Changed files:

  codegen/test/include/test.hpp                                                          +848    -0
  codegen/test/rtc/CMakeLists.txt                                                           +6    -0
  codegen/test/rtc/include/rtc/compile_kernel.hpp                                          +27    -0
  codegen/test/rtc/include/rtc/hip.hpp                                                     +78    -0
  codegen/test/rtc/include/rtc/kernel.hpp                                                  +62    -0
  codegen/test/rtc/include/rtc/manage_ptr.hpp                                              +55    -0
  codegen/test/rtc/include/rtc/tmp_dir.hpp                                                 +24    -0
  codegen/test/rtc/src/compile_kernel.cpp                                                  +95    -0
  codegen/test/rtc/src/hip.cpp                                                            +102    -0
  codegen/test/rtc/src/kernel.cpp                                                         +121    -0
  codegen/test/rtc/src/tmp_dir.cpp                                                         +48    -0
  docs/dockerhub.rst                                                                        +1    -1
  docs/sphinx/requirements.in                                                               +1    -1
  docs/sphinx/requirements.txt                                                              +1    -1
  example/01_gemm/gemm_xdl_fp8.cpp                                                          +9    -5
  example/01_gemm/gemm_xdl_fp8_bf8.cpp                                                      +4    -4
  include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp    +237   -64
  include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp                             +29   -24
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp           +5    -5
  include/ck/utility/type_convert.hpp                                                     +32   -18
codegen/test/include/test.hpp (new file, mode 100644)

/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include <atomic>
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <functional>
#include <iostream>
#include <sstream>
#include <type_traits>
#include <unordered_map>
#include <vector>
#ifdef __linux__
#include <unistd.h>
#endif

#ifndef MIGRAPHX_GUARD_TEST_TEST_HPP
#define MIGRAPHX_GUARD_TEST_TEST_HPP

namespace test {

// clang-format off
// NOLINTNEXTLINE
#define TEST_FOREACH_BINARY_OPERATORS(m) \
    m(==, equal) \
    m(!=, not_equal) \
    m(<=, less_than_equal) \
    m(>=, greater_than_equal) \
    m(<, less_than) \
    m(>, greater_than) \
    m(and, and_op) \
    m(or, or_op)
// clang-format on

// clang-format off
// NOLINTNEXTLINE
#define TEST_FOREACH_UNARY_OPERATORS(m) \
    m(not, not_op)
// clang-format on

// NOLINTNEXTLINE
#define TEST_EACH_BINARY_OPERATOR_OBJECT(op, name)     \
    struct name                                        \
    {                                                  \
        static std::string as_string() { return #op; } \
        template <class T, class U>                    \
        static decltype(auto) call(T&& x, U&& y)       \
        {                                              \
            return x op y;                             \
        }                                              \
    };

// NOLINTNEXTLINE
#define TEST_EACH_UNARY_OPERATOR_OBJECT(op, name)      \
    struct name                                        \
    {                                                  \
        static std::string as_string() { return #op; } \
        template <class T>                             \
        static decltype(auto) call(T&& x)              \
        {                                              \
            return op x;                               \
        }                                              \
    };

TEST_FOREACH_BINARY_OPERATORS(TEST_EACH_BINARY_OPERATOR_OBJECT)
TEST_FOREACH_UNARY_OPERATORS(TEST_EACH_UNARY_OPERATOR_OBJECT)

struct nop
{
    static std::string as_string() { return ""; }
    template <class T>
    static auto call(T&& x)
    {
        return static_cast<T&&>(x);
    }
};

struct function
{
    static std::string as_string() { return ""; }
    template <class T>
    static decltype(auto) call(T&& x)
    {
        return x();
    }
};

template <class Stream, class Iterator>
Stream& stream_range(Stream& s, Iterator start, Iterator last);

template <class Stream>
inline Stream& operator<<(Stream& s, std::nullptr_t)
{
    s << "nullptr";
    return s;
}

template <class Stream,
          class Range,
          class = typename std::enable_if<not std::is_convertible<Range, std::string>{}>::type>
inline auto operator<<(Stream& s, const Range& v) -> decltype(stream_range(s, v.begin(), v.end()))
{
    s << "{ ";
    stream_range(s, v.begin(), v.end());
    s << "}";
    return s;
}

template <class Stream, class Iterator>
inline Stream& stream_range(Stream& s, Iterator start, Iterator last)
{
    if(start != last)
    {
        s << *start;
        std::for_each(std::next(start), last, [&](auto&& x) { s << ", " << x; });
    }
    return s;
}

template <class T>
const T& get_value(const T& x)
{
    return x;
}

template <class T, class Operator = nop>
struct lhs_expression;

template <class T>
lhs_expression<T> make_lhs_expression(T&& lhs);

template <class T, class Operator>
lhs_expression<T, Operator> make_lhs_expression(T&& lhs, Operator);

// NOLINTNEXTLINE
#define TEST_EXPR_BINARY_OPERATOR(op, name)                        \
    template <class V>                                             \
    auto operator op(const V& rhs2) const                          \
    {                                                              \
        return make_expression(*this, rhs2, name{}); /* NOLINT */  \
    }

// NOLINTNEXTLINE
#define TEST_EXPR_UNARY_OPERATOR(op, name) \
    auto operator op() const { return make_lhs_expression(lhs, name{}); /* NOLINT */ }

template <class T, class U, class Operator>
struct expression
{
    T lhs;
    U rhs;

    friend std::ostream& operator<<(std::ostream& s, const expression& self)
    {
        s << self.lhs << " " << Operator::as_string() << " " << self.rhs;
        return s;
    }

    friend decltype(auto) get_value(const expression& e) { return e.value(); }

    decltype(auto) value() const { return Operator::call(get_value(lhs), get_value(rhs)); }

    TEST_FOREACH_UNARY_OPERATORS(TEST_EXPR_UNARY_OPERATOR)
    TEST_FOREACH_BINARY_OPERATORS(TEST_EXPR_BINARY_OPERATOR)
};

// TODO: Remove rvalue references
template <class T, class U, class Operator>
expression<T, U, Operator> make_expression(T&& rhs, U&& lhs, Operator)
{
    return {std::forward<T>(rhs), std::forward<U>(lhs)};
}

// TODO: Remove rvalue reference
template <class T>
lhs_expression<T> make_lhs_expression(T&& lhs)
{
    return lhs_expression<T>{std::forward<T>(lhs)};
}

template <class T, class Operator>
lhs_expression<T, Operator> make_lhs_expression(T&& lhs, Operator)
{
    return lhs_expression<T, Operator>{std::forward<T>(lhs)};
}

template <class T, class Operator>
struct lhs_expression
{
    T lhs;
    explicit lhs_expression(T e) : lhs(e) {}

    friend std::ostream& operator<<(std::ostream& s, const lhs_expression& self)
    {
        std::string op = Operator::as_string();
        if(not op.empty())
            s << Operator::as_string() << " ";
        s << self.lhs;
        return s;
    }

    friend decltype(auto) get_value(const lhs_expression& e) { return e.value(); }

    decltype(auto) value() const { return Operator::call(get_value(lhs)); }

    TEST_FOREACH_BINARY_OPERATORS(TEST_EXPR_BINARY_OPERATOR)
    TEST_FOREACH_UNARY_OPERATORS(TEST_EXPR_UNARY_OPERATOR)

// NOLINTNEXTLINE
#define TEST_LHS_REOPERATOR(op)                 \
    template <class U>                          \
    auto operator op(const U& rhs) const        \
    {                                           \
        return make_lhs_expression(lhs op rhs); \
    }
    TEST_LHS_REOPERATOR(+)
    TEST_LHS_REOPERATOR(-)
    TEST_LHS_REOPERATOR(*)
    TEST_LHS_REOPERATOR(/)
    TEST_LHS_REOPERATOR(%)
    TEST_LHS_REOPERATOR(&)
    TEST_LHS_REOPERATOR(|)
    TEST_LHS_REOPERATOR(^)
};

template <class F>
struct predicate
{
    std::string msg;
    F f;

    friend std::ostream& operator<<(std::ostream& s, const predicate& self)
    {
        s << self.msg;
        return s;
    }

    decltype(auto) operator()() const { return f(); }

    operator decltype(auto)() const { return f(); }
};

template <class F>
auto make_predicate(const std::string& msg, F f)
{
    return make_lhs_expression(predicate<F>{msg, f}, function{});
}

inline std::string as_string(bool x)
{
    if(x)
        return "true";
    return "false";
}

template <class T>
std::string as_string(const T& x)
{
    std::stringstream ss;
    ss << x;
    return ss.str();
}

template <class Iterator>
std::string as_string(Iterator start, Iterator last)
{
    std::stringstream ss;
    stream_range(ss, start, last);
    return ss.str();
}

template <class F>
auto make_function(const std::string& name, F f)
{
    return [=](auto&&... xs) {
        std::vector<std::string> args = {as_string(xs)...};
        return make_predicate(name + "(" + as_string(args.begin(), args.end()) + ")",
                              [=] { return f(xs...); });
    };
}

struct capture
{
    template <class T>
    auto operator->*(const T& x) const
    {
        return make_lhs_expression(x);
    }

    template <class T, class Operator>
    auto operator->*(const lhs_expression<T, Operator>& x) const
    {
        return x;
    }
};

enum class color
{
    reset      = 0,
    bold       = 1,
    underlined = 4,
    fg_red     = 31,
    fg_green   = 32,
    fg_yellow  = 33,
    fg_blue    = 34,
    fg_default = 39,
    bg_red     = 41,
    bg_green   = 42,
    bg_yellow  = 43,
    bg_blue    = 44,
    bg_default = 49
};

inline std::ostream& operator<<(std::ostream& os, const color& c)
{
#ifndef _WIN32
    static const bool use_color = isatty(STDOUT_FILENO) != 0;
    if(use_color)
        return os << "\033[" << static_cast<std::size_t>(c) << "m";
#else
    (void)c;
#endif
    return os;
}

inline std::atomic<int>& failures()
{
    // NOLINTNEXTLINE
    static std::atomic<int> f = 0;
    return f;
}

template <class T, class F>
void failed(T x, const char* msg, const char* func, const char* file, int line, F f)
{
    if(not bool(x.value()))
    {
        failures()++;
        std::cout << func << std::endl;
        std::cout << file << ":" << line << ":" << std::endl;
        std::cout << color::bold << color::fg_red << "    FAILED: " << color::reset << msg << " "
                  << "[ " << x << " ]" << std::endl;
        f();
    }
}

template <class F>
bool throws(F f)
{
    try
    {
        f();
        return false;
    }
    catch(...)
    {
        return true;
    }
}

template <class Exception, class F>
bool throws(F f, const std::string& msg = "")
{
    try
    {
        f();
        return false;
    }
    catch(const Exception& ex)
    {
        return std::string(ex.what()).find(msg) != std::string::npos;
    }
}

template <class T, class U>
auto within_abs(T px, U py, double ptol = 1e-6f)
{
    return make_function("near", [](auto x, auto y, auto tol) { return std::abs(x - y) < tol; })(
        px, py, ptol);
}

// This implements the basic globbing algorithm where `*` matches any number
// of characters (including none) and `?` matches any single character. It
// doesn't support character classes.
//
// This is a simple recursive implementation that scans the string where the
// string and pattern match. When a `*` is found in the pattern, the
// `glob_match` function is called recursively to compare the rest of the
// pattern to the rest of the string. If the recursive call returns true,
// then we have a match. However, if it returns false, then we advance one
// character and make the recursive call again. This is referred to as a
// star-loop, which will consume zero or more characters.
//
// This simple recursive implementation works well for short strings and
// patterns with few stars. First, it is unlikely to use many stars to glob
// test names. Secondly, using many stars is still significantly faster than
// using the equivalent std::regex, which has a much slower time complexity.
template <class Iterator1, class Iterator2>
bool glob_match(Iterator1 start, Iterator1 last, Iterator2 pattern_start, Iterator2 pattern_last)
{
    std::tie(start, pattern_start) =
        std::mismatch(start, last, pattern_start, pattern_last, [](auto c, auto m) {
            if(m == '?')
                return true;
            // We need a loop for star, so bail and handle the loop below
            if(m == '*')
                return false;
            return c == m;
        });
    // If there is no more pattern then return true if there is no more string to match
    if(pattern_start == pattern_last)
        return start == last;
    // If the pattern is not a star then its a mismatch
    if(*pattern_start != '*')
        return false;
    // Multiple stars are the same as a single star so skip over multiple stars
    pattern_start = std::find_if(pattern_start, pattern_last, [](auto c) { return c != '*'; });
    // If the star is at the end then return true
    if(pattern_start == pattern_last)
        return true;
    // star-loop: match the rest of the pattern and text
    while(not glob_match(start, last, pattern_start, pattern_last) and start != last)
        start++;
    // If the string is empty then it means a match was never found
    return start != last;
}

using string_map = std::unordered_map<std::string, std::vector<std::string>>;

template <class Keyword>
string_map generic_parse(std::vector<std::string> as, Keyword keyword)
{
    string_map result;
    std::string flag;
    for(auto&& x : as)
    {
        auto f = keyword(x);
        if(f.empty())
        {
            result[flag].push_back(x);
        }
        else
        {
            flag = f.front();
            result[flag]; // Ensure the flag exists
            flag = f.back();
        }
    }
    return result;
}

using test_case = std::function<void()>;

inline auto& get_test_cases()
{
    // NOLINTNEXTLINE
    static std::vector<std::pair<std::string, test_case>> cases;
    return cases;
}

inline void add_test_case(std::string name, test_case f)
{
    get_test_cases().emplace_back(std::move(name), std::move(f));
}

struct auto_register_test_case
{
    template <class F>
    auto_register_test_case(const char* name, F f) noexcept
    {
        add_test_case(name, f);
    }
};

struct failure_error
{
};

[[noreturn]] inline void fail() { throw failure_error{}; }

struct driver
{
    driver()
    {
        add_flag({"--help", "-h"}, "Show help");
        add_flag({"--list", "-l"}, "List all test cases");
        add_flag({"--continue", "-c"}, "Continue after failure");
        add_flag({"--quiet", "-q"}, "Don't print out extra output");
    }

    struct argument
    {
        std::vector<std::string> flags = {};
        std::string help               = "";
        int nargs                      = 1;
    };

    void add_arg(const std::vector<std::string>& flags, const std::string& help = "")
    {
        arguments.push_back(argument{flags, help, 1});
    }

    void add_flag(const std::vector<std::string>& flags, const std::string& help = "")
    {
        arguments.push_back(argument{flags, help, 0});
    }

    static void wrap(std::ostream& os,
                     const std::string& text,
                     const std::string& prefix = "",
                     unsigned int line_length  = 80)
    {
        std::istringstream iss(text);
        std::string line = prefix;
        do
        {
            std::string word;
            iss >> word;
            if(line.length() + word.length() > line_length)
            {
                os << line << std::endl;
                line = prefix;
            }
            line += word + " ";
        } while(iss);
        if(not line.empty())
            os << line << std::endl;
    }

    void show_help(const std::string& exe) const
    {
        const std::string prefix = "    ";
        std::cout << std::endl;
        std::cout << color::fg_yellow << "USAGE:" << color::reset << std::endl;
        std::cout << "    ";
        std::cout << exe << " <test-case>... <options>" << std::endl;
        std::cout << std::endl;
        std::cout << color::fg_yellow << "ARGS:" << color::reset << std::endl;
        std::cout << "    ";
        std::cout << color::fg_green << "<test-case>..." << color::reset;
        std::cout << std::endl;
        wrap(std::cout,
             "Test cases to run. A test case can be either the exact test case name or a glob. A "
             "glob expression uses a '*' to select zero or more characters or a '?' to select any "
             "single character.",
             prefix + prefix);
        std::cout << std::endl;
        std::cout << color::fg_yellow << "OPTIONS:" << color::reset << std::endl;
        for(auto&& arg : arguments)
        {
            std::cout << color::fg_green;
            std::string arg_prefix = prefix;
            for(const std::string& a : arg.flags)
            {
                std::cout << arg_prefix;
                std::cout << a;
                arg_prefix = ", ";
            }
            std::cout << color::reset << std::endl;
            wrap(std::cout, arg.help, prefix + prefix);
        }
    }

    std::ostream& out() const
    {
        struct null_buffer : std::streambuf
        {
            virtual int overflow(int c) override { return c; }
        };
        static null_buffer buffer;
        static std::ostream null_stream(&buffer);
        if(quiet)
            return null_stream;
        return std::cout;
    }

    string_map parse(int argc, const char* argv[]) const
    {
        std::vector<std::string> args(argv + 1, argv + argc);
        string_map keys;
        for(auto&& arg : arguments)
        {
            for(auto&& flag : arg.flags)
            {
                keys[flag] = {arg.flags.front()};
                if(arg.nargs == 0)
                    keys[flag].push_back("");
            }
        }
        auto result = generic_parse(args, [&](auto&& s) -> std::vector<std::string> {
            if(keys.count(s) > 0)
                return keys[s];
            else
                return {};
        });
        result["__exe__"].push_back(argv[0]);
        return result;
    }

    static std::string create_command(const string_map& args)
    {
        std::stringstream ss;
        ss << args.at("__exe__").front();
        if(args.count("") > 0)
        {
            for(auto&& arg : args.at(""))
                ss << " \"" << arg << "\" ";
        }
        for(auto&& p : args)
        {
            if(p.first == "__exe__")
                continue;
            if(p.first.empty())
                continue;
            ss << " " << p.first;
            for(auto&& arg : p.second)
                ss << " \"" << arg << "\" ";
        }
        return ss.str();
    }

    static std::string fork(const std::string& name, string_map args)
    {
        std::string msg;
        args[""] = {name};
        args.erase("--continue");
        args["--quiet"];
        auto cmd = create_command(args);
        auto r   = std::system(cmd.c_str()); // NOLINT
        if(r != 0)
            msg = "Exited with " + std::to_string(r);
        return msg;
    }

    static std::vector<std::pair<std::string, test_case>> glob_tests(const std::string& pattern)
    {
        std::vector<std::pair<std::string, test_case>> result;
        std::copy_if(get_test_cases().begin(),
                     get_test_cases().end(),
                     std::back_inserter(result),
                     [&](auto&& p) {
                         return glob_match(
                             p.first.begin(), p.first.end(), pattern.begin(), pattern.end());
                     });
        return result;
    }

    void run_test_case(const std::string& name, const test_case& f, const string_map& args)
    {
        ran++;
        out() << color::fg_green << "[ RUN ] " << color::reset << color::bold << name
              << color::reset << std::endl;
        std::string msg;
        auto start = std::chrono::steady_clock::now();
        if(args.count("--continue") > 0)
        {
            msg = fork(name, args);
        }
        else
        {
            try
            {
                failures() = 0;
                f();
            }
            // cppcheck-suppress migraphx-EmptyCatchStatement
            catch(const failure_error&)
            {
            }
        }
        auto finish = std::chrono::steady_clock::now();
        auto elapsed_ms =
            std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(finish - start)
                .count();
        if(msg.empty() and failures() != 0)
        {
            if(failures() == 1)
                msg = "Test failure";
            else
                msg = std::to_string(failures()) + " test failures";
        }
        if(msg.empty())
        {
            out() << color::fg_green << "[ COMPLETE ] " << color::reset;
        }
        else
        {
            failed.push_back(name);
            out() << color::fg_red << "[ FAILED ] " << color::reset;
        }
        out() << color::bold << name << color::reset;
        out() << color::fg_blue << " (" << elapsed_ms << "ms)" << color::reset;
        if(not msg.empty())
            out() << ": " << color::fg_yellow << msg << color::reset;
        out() << std::endl;
    }

    void run(int argc, const char* argv[])
    {
        auto args = parse(argc, argv);
        if(args.count("--help") > 0)
        {
            show_help(args.at("__exe__").front());
            return;
        }
        if(args.count("--list") > 0)
        {
            for(auto&& tc : get_test_cases())
                out() << tc.first << std::endl;
            return;
        }
        if(args.count("--quiet") > 0)
            quiet = true;
        auto cases = args[""];
        if(cases.empty())
        {
            for(auto&& tc : get_test_cases())
                run_test_case(tc.first, tc.second, args);
        }
        else
        {
            std::unordered_map<std::string, test_case> m(get_test_cases().begin(),
                                                         get_test_cases().end());
            for(auto&& iname : cases)
            {
                std::vector<std::pair<std::string, test_case>> found_cases;
                for(auto&& pattern : get_case_names(iname))
                {
                    auto f = m.find(pattern);
                    if(f == m.end())
                    {
                        found_cases = glob_tests(pattern);
                    }
                    else
                    {
                        found_cases.push_back(*f);
                    }
                }
                if(found_cases.empty())
                {
                    out() << color::fg_red << "[ ERROR ] Test case '" << iname << "' not found."
                          << color::reset << std::endl;
                    failed.push_back(iname);
                }
                for(auto&& p : found_cases)
                    run_test_case(p.first, p.second, args);
            }
        }
        out() << color::fg_green << "[==========] " << color::fg_yellow << ran << " tests ran"
              << color::reset << std::endl;
        if(not failed.empty())
        {
            out() << color::fg_red << "[ FAILED ] " << color::fg_yellow << failed.size()
                  << " tests failed" << color::reset << std::endl;
            for(auto&& name : failed)
                out() << color::fg_red << "[ FAILED ] " << color::fg_yellow << name
                      << color::reset << std::endl;
            std::exit(1);
        }
    }

    std::function<std::vector<std::string>(const std::string&)> get_case_names =
        [](const std::string& name) -> std::vector<std::string> { return {name}; };
    std::vector<argument> arguments = {};
    std::vector<std::string> failed = {};
    std::size_t ran                 = 0;
    bool quiet                      = false;
};

inline void run(int argc, const char* argv[])
{
    driver d{};
    d.run(argc, argv);
}

} // namespace test

// NOLINTNEXTLINE
#define TEST_CAPTURE(...) test::capture{}->*__VA_ARGS__
// NOLINTNEXTLINE
#define CHECK(...) \
    test::failed(  \
        TEST_CAPTURE(__VA_ARGS__), #__VA_ARGS__, __PRETTY_FUNCTION__, __FILE__, __LINE__, [] {})
// NOLINTNEXTLINE
#define EXPECT(...)                         \
    test::failed(TEST_CAPTURE(__VA_ARGS__), \
                 #__VA_ARGS__,              \
                 __PRETTY_FUNCTION__,       \
                 __FILE__,                  \
                 __LINE__,                  \
                 &test::fail)
// NOLINTNEXTLINE
#define STATUS(...) EXPECT((__VA_ARGS__) == 0)
// NOLINTNEXTLINE
#define TEST_CAT(x, ...) TEST_PRIMITIVE_CAT(x, __VA_ARGS__)
// NOLINTNEXTLINE
#define TEST_PRIMITIVE_CAT(x, ...) x##__VA_ARGS__
// NOLINTNEXTLINE
#define TEST_CASE_REGISTER(...)                                                    \
    static test::auto_register_test_case TEST_CAT(register_test_case_, __LINE__) = \
        test::auto_register_test_case(#__VA_ARGS__, &__VA_ARGS__);
// NOLINTNEXTLINE
#define TEST_CASE(...)              \
    void __VA_ARGS__();             \
    TEST_CASE_REGISTER(__VA_ARGS__) \
    void __VA_ARGS__()

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wglobal-constructors"
#endif

#endif
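For orientation only, a minimal test program built on this header could look like the sketch below. The file name, test name, and checks are hypothetical; `TEST_CASE`, `CHECK`, `EXPECT`, `test::throws`, and `test::run` are the facilities defined above.

// example_test.cpp (illustrative sketch, not part of this commit)
#include "test.hpp"

// TEST_CASE registers the function with the global test driver at static-init time.
TEST_CASE(check_addition)
{
    int x = 2 + 2;
    CHECK(x == 4);                         // records a failure and keeps running
    EXPECT(test::throws([] { throw 1; })); // records a failure and aborts this case
}

// The driver parses --help/--list/--continue/--quiet and runs the matching cases.
int main(int argc, const char* argv[]) { test::run(argc, argv); }

Test names passed on the command line may also be glob patterns, which are resolved by the `glob_match` routine above.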
codegen/test/rtc/CMakeLists.txt (new file, mode 100644)

find_package(hip)

file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)

add_library(ck_rtc ${RTC_SOURCES})
target_include_directories(ck_rtc PUBLIC include)
target_link_libraries(ck_rtc PUBLIC hip::host)
codegen/test/rtc/include/rtc/compile_kernel.hpp (new file, mode 100644)

#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL

#include <rtc/kernel.hpp>
#include <filesystem>
#include <string>

namespace rtc {

struct src_file
{
    std::filesystem::path path;
    std::string_view content;
};

struct compile_options
{
    std::string flags       = "";
    std::string kernel_name = "main";
};

kernel compile_kernel(const std::vector<src_file>& src,
                      compile_options options = compile_options{});

} // namespace rtc
#endif
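A hedged usage sketch (not from this commit): compile an embedded HIP source into a loadable kernel. The source string, file name, and kernel name below are illustrative assumptions; `compile_kernel` writes each source into a temporary directory, invokes the compiler, and loads the resulting object as a `kernel`.

#include <rtc/compile_kernel.hpp>
#include <string_view>

// Illustrative device source; the kernel name must match compile_options::kernel_name.
constexpr std::string_view example_src = R"(
extern "C" __global__ void fill_tid(float* x) { x[threadIdx.x] = threadIdx.x; }
)";

inline rtc::kernel make_fill_tid_kernel()
{
    rtc::compile_options opts;
    opts.kernel_name = "fill_tid"; // the default would be "main"
    return rtc::compile_kernel({{"example.cpp", example_src}}, opts);
}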
codegen/test/rtc/include/rtc/hip.hpp (new file, mode 100644)

#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP

#include <hip/hip_runtime_api.h>
#include <memory>
#include <string>

namespace rtc {

template <class T>
struct buffer
{
    buffer() : ptr(), n(0) {}

    buffer(std::shared_ptr<T> p, std::size_t sz) : ptr(p), n(sz) {}

    buffer(std::shared_ptr<void> p, std::size_t sz)
        : ptr(std::reinterpret_pointer_cast<T>(p)), n(sz)
    {
    }

    explicit buffer(std::size_t sz) : ptr(new T[sz]), n(sz) {}

    T* begin() { return data(); }
    T* end() { return data() + size(); }

    const T* begin() const { return data(); }
    const T* end() const { return data() + size(); }

    T& front() { return data()[0]; }
    T& back() { return data()[size() - 1]; }
    T& operator[](std::size_t i) { return data()[i]; }
    T& at(std::size_t i)
    {
        if(i >= size())
            throw std::runtime_error("Out of bounds");
        return data()[i];
    }

    const T& front() const { return data()[0]; }
    const T& back() const { return data()[size() - 1]; }
    const T& operator[](std::size_t i) const { return data()[i]; }
    const T& at(std::size_t i) const
    {
        if(i >= size())
            throw std::runtime_error("Out of bounds");
        return data()[i];
    }

    const T* data() const { return ptr.get(); }
    T* data() { return ptr.get(); }

    std::size_t size() const { return n; }
    std::size_t bytes() const { return size() * sizeof(T); }
    bool empty() const { return size() == 0; }

    private:
    std::shared_ptr<T> ptr;
    std::size_t n;
};

std::string get_device_name();

std::string hip_error(int error);

std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host = false);

std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host = false);

std::shared_ptr<void> read_from_gpu(const void* x, std::size_t sz);

template <class T>
buffer<T> to_gpu(const buffer<T>& input)
{
    return {write_to_gpu(input.data(), input.bytes()), input.size()};
}

template <class T>
buffer<T> from_gpu(const buffer<T>& input)
{
    return {read_from_gpu(input.data(), input.bytes()), input.size()};
}

} // namespace rtc
#endif
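To illustrate how these pieces are meant to compose (an assumption-level sketch, not code from the commit): allocate a host buffer, copy it to the device, and read it back.

#include <rtc/hip.hpp>
#include <numeric>

inline rtc::buffer<float> round_trip(std::size_t n)
{
    rtc::buffer<float> host(n);                    // host-side allocation of n floats
    std::iota(host.begin(), host.end(), 0.0f);     // fill with 0, 1, 2, ...
    rtc::buffer<float> device = rtc::to_gpu(host); // write_to_gpu + hipMemcpy under the hood
    return rtc::from_gpu(device);                  // copy back into a fresh host buffer
}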
codegen/test/rtc/include/rtc/kernel.hpp (new file, mode 100644)

#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL

#include <hip/hip_runtime_api.h>
#include <memory>
#include <string>
#include <vector>

namespace rtc {

struct kernel_argument
{
    template <class T,
              class U = std::remove_reference_t<T>,
              class   = std::enable_if_t<not std::is_base_of<kernel_argument, T>{}>>
    kernel_argument(T&& x) : size(sizeof(U)), align(alignof(U)), data(&x) // NOLINT
    {
    }
    std::size_t size;
    std::size_t align;
    void* data;
};

std::vector<char> pack_args(const std::vector<kernel_argument>& args);

struct kernel_impl;

struct kernel
{
    kernel() = default;
    kernel(const char* image, const std::string& name);

    template <class T>
    kernel(const std::vector<T>& image, const std::string& name)
        : kernel(reinterpret_cast<const char*>(image.data()), name)
    {
        static_assert(sizeof(T) == 1, "Only byte types");
    }

    void launch(hipStream_t stream,
                std::size_t global,
                std::size_t local,
                const std::vector<kernel_argument>& args) const;

    void launch(hipStream_t stream,
                std::size_t global,
                std::size_t local,
                std::vector<void*> args) const;

    template <class... Ts>
    auto launch(hipStream_t stream, std::size_t global, std::size_t local, Ts... zs) const
    {
        return [=](auto&&... xs) {
            launch(stream, global, local, std::vector<kernel_argument>{xs...}, zs...);
        };
    }

    private:
    std::shared_ptr<kernel_impl> impl;
};

} // namespace rtc
#endif
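A sketch of launching a compiled kernel; the function name and launch sizes are illustrative, and `nullptr` selects the default HIP stream.

#include <rtc/kernel.hpp>
#include <rtc/hip.hpp>
#include <vector>

inline void launch_on_default_stream(const rtc::kernel& k, rtc::buffer<float>& device_buf)
{
    float* p = device_buf.data();
    // 64 work-items in total, 64 per workgroup; the pointer is packed via kernel_argument.
    k.launch(nullptr, 64, 64, std::vector<rtc::kernel_argument>{p});
}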
codegen/test/rtc/include/rtc/manage_ptr.hpp (new file, mode 100644)

#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER

#include <type_traits>
#include <memory>

namespace rtc {

template <class F, F f>
struct manage_deleter
{
    template <class T>
    void operator()(T* x) const
    {
        if(x != nullptr)
        {
            (void)f(x);
        }
    }
};

struct null_deleter
{
    template <class T>
    void operator()(T*) const
    {
    }
};

template <class T, class F, F f>
using manage_ptr = std::unique_ptr<T, manage_deleter<F, f>>;

template <class T>
struct element_type
{
    using type = typename T::element_type;
};

template <class T>
using remove_ptr = typename std::
    conditional_t<std::is_pointer<T>{}, std::remove_pointer<T>, element_type<T>>::type;

template <class T>
using shared = std::shared_ptr<remove_ptr<T>>;

template <class T>
shared<T> share(T p)
{
    return shared<T>{std::move(p)};
}

#define RTC_MANAGE_PTR(T, F) rtc::manage_ptr<std::remove_pointer_t<T>, decltype(&F), &F>

} // namespace rtc
#endif
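The source files in this same commit use the macro as shown below, which makes its intent concrete: a `std::unique_ptr` whose deleter calls the given HIP release function.

using hip_ptr        = RTC_MANAGE_PTR(void, hipFree);                // from hip.cpp below
using hip_module_ptr = RTC_MANAGE_PTR(hipModule_t, hipModuleUnload); // from kernel.cpp below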
codegen/test/rtc/include/rtc/tmp_dir.hpp (new file, mode 100644)

#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR

#include <string>
#include <filesystem>

namespace rtc {

struct tmp_dir
{
    std::filesystem::path path;
    tmp_dir(const std::string& prefix = "");

    void execute(const std::string& cmd) const;

    tmp_dir(tmp_dir const&) = delete;
    tmp_dir& operator=(tmp_dir const&) = delete;

    ~tmp_dir();
};

} // namespace rtc
#endif
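A small lifecycle sketch (the command is illustrative): the directory is created under the system temp path on construction, `execute` runs a shell command with that directory as the working directory, and the destructor removes the whole tree.

#include <rtc/tmp_dir.hpp>

inline void tmp_dir_demo()
{
    rtc::tmp_dir td{"demo"};         // creates e.g. <tmp>/ck-rtc-demo-<unique-suffix>
    td.execute("echo hi > out.txt"); // runs via std::system with td.path as the cwd
}                                    // td's destructor removes the directory tree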
codegen/test/rtc/src/compile_kernel.cpp (new file, mode 100644)

#include "rtc/hip.hpp"
#include <rtc/compile_kernel.hpp>
#include <rtc/tmp_dir.hpp>
#include <stdexcept>
#include <iostream>
#include <fstream>
#include <cassert>

namespace rtc {

template <class T>
T generic_read_file(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
{
    std::ifstream is(filename, std::ios::binary | std::ios::ate);
    if(nbytes == 0)
    {
        // if there is a non-zero offset and nbytes is not set,
        // calculate size of remaining bytes to read
        nbytes = is.tellg();
        if(offset > nbytes)
            throw std::runtime_error("offset is larger than file size");
        nbytes -= offset;
    }
    if(nbytes < 1)
        throw std::runtime_error("Invalid size for: " + filename);
    is.seekg(offset, std::ios::beg);

    T buffer(nbytes, 0);
    if(not is.read(&buffer[0], nbytes))
        throw std::runtime_error("Error reading file: " + filename);
    return buffer;
}

std::vector<char> read_buffer(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
{
    return generic_read_file<std::vector<char>>(filename, offset, nbytes);
}

std::string read_string(const std::string& filename)
{
    return generic_read_file<std::string>(filename);
}

void write_buffer(const std::string& filename, const char* buffer, std::size_t size)
{
    std::ofstream os(filename);
    os.write(buffer, size);
}

void write_buffer(const std::string& filename, const std::vector<char>& buffer)
{
    write_buffer(filename, buffer.data(), buffer.size());
}

void write_string(const std::string& filename, const std::string_view& buffer)
{
    write_buffer(filename, buffer.data(), buffer.size());
}

std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip --cuda-device-only"; }

kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options)
{
    assert(not srcs.empty());
    tmp_dir td{"compile"};
    options.flags += " -I. -O3";
    options.flags += " -std=c++17";
    options.flags += " --offload-arch=" + get_device_name();

    std::string out;

    for(const auto& src : srcs)
    {
        std::filesystem::path full_path   = td.path / src.path;
        std::filesystem::path parent_path = full_path.parent_path();
        std::filesystem::create_directories(parent_path);
        write_string(full_path.string(), src.content);
        if(src.path.extension().string() == ".cpp")
        {
            options.flags += " -c " + src.path.filename().string();
            if(out.empty())
                out = src.path.stem().string() + ".o";
        }
    }

    options.flags += " -o " + out;

    td.execute(compiler() + options.flags);

    auto out_path = td.path / out;
    if(not std::filesystem::exists(out_path))
        throw std::runtime_error("Output file missing: " + out);

    auto obj = read_buffer(out_path.string());

    return kernel{obj.data(), options.kernel_name};
}

} // namespace rtc
codegen/test/rtc/src/hip.cpp (new file, mode 100644)

#include <rtc/hip.hpp>
#include <rtc/manage_ptr.hpp>
#include <stdexcept>
#include <cassert>

namespace rtc {

using hip_ptr = RTC_MANAGE_PTR(void, hipFree);

std::string hip_error(int error) { return hipGetErrorString(static_cast<hipError_t>(error)); }

int get_device_id()
{
    int device;
    auto status = hipGetDevice(&device);
    if(status != hipSuccess)
        throw std::runtime_error("No device");
    return device;
}

std::string get_device_name()
{
    hipDeviceProp_t props{};
    auto status = hipGetDeviceProperties(&props, get_device_id());
    if(status != hipSuccess)
        throw std::runtime_error("Failed to get device properties");
    return props.gcnArchName;
}

bool is_device_ptr(const void* ptr)
{
    hipPointerAttribute_t attr;
    auto status = hipPointerGetAttributes(&attr, ptr);
    if(status != hipSuccess)
        return false;
    return attr.type == hipMemoryTypeDevice;
}

void gpu_sync()
{
    auto status = hipDeviceSynchronize();
    if(status != hipSuccess)
        throw std::runtime_error("hip device synchronization failed: " + hip_error(status));
}

std::size_t get_available_gpu_memory()
{
    size_t free;
    size_t total;
    auto status = hipMemGetInfo(&free, &total);
    if(status != hipSuccess)
        throw std::runtime_error("Failed getting available memory: " + hip_error(status));
    return free;
}

std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host)
{
    if(sz > get_available_gpu_memory())
        throw std::runtime_error("Memory not available to allocate buffer: " + std::to_string(sz));
    void* alloc_ptr = nullptr;
    auto status     = host ? hipHostMalloc(&alloc_ptr, sz) : hipMalloc(&alloc_ptr, sz);
    if(status != hipSuccess)
    {
        if(host)
            throw std::runtime_error("Gpu allocation failed: " + hip_error(status));
        else
            return allocate_gpu(sz, true);
    }
    assert(alloc_ptr != nullptr);
    std::shared_ptr<void> result = share(hip_ptr{alloc_ptr});
    return result;
}

std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host)
{
    gpu_sync();
    auto result = allocate_gpu(sz, host);
    assert(is_device_ptr(result.get()));
    assert(not is_device_ptr(x));
    auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
    if(status != hipSuccess)
        throw std::runtime_error("Copy to gpu failed: " + hip_error(status));
    return result;
}

std::shared_ptr<void> read_from_gpu(const void* x, std::size_t sz)
{
    gpu_sync();
    std::shared_ptr<char> result(new char[sz]);
    assert(not is_device_ptr(result.get()));
    if(not is_device_ptr(x))
    {
        throw std::runtime_error(
            "read_from_gpu() requires Src buffer to be on the GPU, Copy from gpu failed\n");
    }
    auto status = hipMemcpy(result.get(), x, sz, hipMemcpyDeviceToHost);
    if(status != hipSuccess)
        throw std::runtime_error("Copy from gpu failed: " + hip_error(status)); // NOLINT
    return std::static_pointer_cast<void>(result);
}

} // namespace rtc
codegen/test/rtc/src/kernel.cpp (new file, mode 100644, no newline at end of file)

#include <rtc/kernel.hpp>
#include <rtc/manage_ptr.hpp>
#include <rtc/hip.hpp>
#include <cassert>

// extern declare the function since hip/hip_ext.h header is broken
extern hipError_t hipExtModuleLaunchKernel(hipFunction_t, // NOLINT
                                           uint32_t,
                                           uint32_t,
                                           uint32_t,
                                           uint32_t,
                                           uint32_t,
                                           uint32_t,
                                           size_t,
                                           hipStream_t,
                                           void**,
                                           void**,
                                           hipEvent_t = nullptr,
                                           hipEvent_t = nullptr,
                                           uint32_t   = 0);

namespace rtc {

std::vector<char> pack_args(const std::vector<kernel_argument>& args)
{
    std::vector<char> kernargs;
    for(auto&& arg : args)
    {
        std::size_t n = arg.size;
        const auto* p = static_cast<const char*>(arg.data);
        // Insert padding
        std::size_t padding = (arg.align - (kernargs.size() % arg.align)) % arg.align;
        kernargs.insert(kernargs.end(), padding, 0);
        kernargs.insert(kernargs.end(), p, p + n);
    }
    return kernargs;
}

using hip_module_ptr = RTC_MANAGE_PTR(hipModule_t, hipModuleUnload);

struct kernel_impl
{
    hip_module_ptr module = nullptr;
    hipFunction_t fun     = nullptr;
};

hip_module_ptr load_module(const char* image)
{
    hipModule_t raw_m;
    auto status = hipModuleLoadData(&raw_m, image);
    hip_module_ptr m{raw_m};
    if(status != hipSuccess)
        throw std::runtime_error("Failed to load module: " + hip_error(status));
    return m;
}

kernel::kernel(const char* image, const std::string& name) : impl(std::make_shared<kernel_impl>())
{
    impl->module = load_module(image);
    auto status  = hipModuleGetFunction(&impl->fun, impl->module.get(), name.c_str());
    if(hipSuccess != status)
        throw std::runtime_error("Failed to get function: " + name + ": " + hip_error(status));
}

void launch_kernel(hipFunction_t fun,
                   hipStream_t stream,
                   std::size_t global,
                   std::size_t local,
                   void* kernargs,
                   std::size_t size)
{
    assert(global > 0);
    assert(local > 0);
    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER,
                      kernargs,
                      HIP_LAUNCH_PARAM_BUFFER_SIZE,
                      &size,
                      HIP_LAUNCH_PARAM_END};

    auto status = hipExtModuleLaunchKernel(fun,
                                           global,
                                           1,
                                           1,
                                           local,
                                           1,
                                           1,
                                           0,
                                           stream,
                                           nullptr,
                                           reinterpret_cast<void**>(&config),
                                           nullptr,
                                           nullptr);
    if(status != hipSuccess)
        throw std::runtime_error("Failed to launch kernel: " + hip_error(status));
}

void kernel::launch(hipStream_t stream,
                    std::size_t global,
                    std::size_t local,
                    std::vector<void*> args) const
{
    assert(impl != nullptr);
    void* kernargs   = args.data();
    std::size_t size = args.size() * sizeof(void*);
    launch_kernel(impl->fun, stream, global, local, kernargs, size);
}

void kernel::launch(hipStream_t stream,
                    std::size_t global,
                    std::size_t local,
                    const std::vector<kernel_argument>& args) const
{
    assert(impl != nullptr);
    std::vector<char> kernargs = pack_args(args);
    std::size_t size           = kernargs.size();
    launch_kernel(impl->fun, stream, global, local, kernargs.data(), size);
}

} // namespace rtc
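To make the padding arithmetic in `pack_args` concrete, here is a small worked example (an illustration, assuming 8-byte pointers): a 4-byte `int` followed by a pointer needs 4 bytes of padding so the pointer starts on an 8-byte boundary, giving a 16-byte argument blob.

#include <rtc/kernel.hpp>
#include <cassert>

inline void pack_args_demo()
{
    int n     = 42;
    float* p  = nullptr;
    auto blob = rtc::pack_args({rtc::kernel_argument(n), rtc::kernel_argument(p)});
    // layout: 4 bytes of n, 4 bytes of padding, 8 bytes of p (on a 64-bit target)
    assert(blob.size() == 16);
}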
codegen/test/rtc/src/tmp_dir.cpp (new file, mode 100644, no newline at end of file)

#include <rtc/tmp_dir.hpp>
#include <algorithm>
#include <random>
#include <thread>
#include <unistd.h>

namespace rtc {

std::string random_string(std::string::size_type length)
{
    static const std::string& chars = "0123456789"
                                      "abcdefghijklmnopqrstuvwxyz"
                                      "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

    std::mt19937 rg{std::random_device{}()};
    std::uniform_int_distribution<std::string::size_type> pick(0, chars.length() - 1);

    std::string str(length, 0);
    std::generate(str.begin(), str.end(), [&] { return chars[pick(rg)]; });

    return str;
}

std::string unique_string(const std::string& prefix)
{
    auto pid = getpid();
    auto tid = std::this_thread::get_id();
    auto clk = std::chrono::steady_clock::now().time_since_epoch().count();
    std::stringstream ss;
    ss << std::hex << prefix << "-" << pid << "-" << tid << "-" << clk << "-" << random_string(16);
    return ss.str();
}

tmp_dir::tmp_dir(const std::string& prefix)
    : path(std::filesystem::temp_directory_path() /
           unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix))
{
    std::filesystem::create_directories(this->path);
}

void tmp_dir::execute(const std::string& cmd) const
{
    std::string s = "cd " + path.string() + "; " + cmd;
    std::system(s.c_str());
}

tmp_dir::~tmp_dir() { std::filesystem::remove_all(this->path); }

} // namespace rtc
docs/dockerhub.rst

@@ -36,7 +36,7 @@ What is inside the image?
 The docker images have everything you need for running CK including:
-* `ROCm <https://www.amd.com/en/graphics/servers-solutions-rocm>`_
+* `ROCm <https://rocm.docs.amd.com/en/latest/index.html>`_
 * `CMake <https://cmake.org/getting-started/>`_
 * `Compiler <https://github.com/ROCm/llvm-project>`_
 * `Composable Kernel library <https://github.com/ROCm/composable_kernel>`_
docs/sphinx/requirements.in

-rocm-docs-core==0.35.0
+rocm-docs-core==0.35.1
 sphinxcontrib-bibtex==2.6.2
docs/sphinx/requirements.txt

@@ -113,7 +113,7 @@ requests==2.31.0
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==0.35.0
+rocm-docs-core==0.35.1
     # via -r requirements.in
 six==1.16.0
     # via
example/01_gemm/gemm_xdl_fp8.cpp

Hunk @@ -20,14 +20,18 @@ (context: `using BElementOp = PassThrough;`, `using CElementOp = PassThrough;`, `static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;`): four definitions are added ahead of the instance, and the `DeviceGemm_Xdl_CShuffle` instantiation, together with its parameter banner comments (elided here), gains four trailing template arguments for the loop scheduler, pipeline version, and compute types.

Added definitions:

static constexpr auto LoopSched   = ck::make_default_loop_scheduler();
static constexpr auto PipelineVer = ck::PipelineVersion::v1;

using ComputeTypeA = ck::f8_t;
using ComputeTypeB = ck::f8_t;

New instantiation (the previous one was identical except that it ended at the `8` argument):

// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
    <ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
     AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2,
     S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1,
     S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1,
     1, 1, S<1, 64, 1, 4>, 8, LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
// clang-format on

using ReferenceGemmInstance = ck::tensor_operation::host::...
example/01_gemm/gemm_xdl_fp8_bf8.cpp

Hunk @@ -27,10 +27,10 @@ (context: `using ComputeTypeB = ck::bf8_t;`): the same change as in gemm_xdl_fp8.cpp, applied to a file that already defines its compute types. The `DeviceGemm_Xdl_CShuffle` instantiation and its banner comments (elided here) gain the four trailing template arguments:

// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
    <ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
     AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2,
     S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1,
     S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1,
     1, 1, S<1, 64, 1, 4>, 8, LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
// clang-format on
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
View file @
10127959
...
...
@@ -498,94 +498,95 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
}
};
static
bool
IsSupported
Argument
(
const
Argument
&
arg
)
static
constexpr
bool
IsSupported
(
index_t
MRaw_
,
index_t
NRaw_
,
index_t
KRaw_
)
{
if
(
!
ck
::
is_xdl_supported
())
{
return
false
;
}
// check vector load/store
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
// check vector load of A
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
ABlockTransferSrcVectorDim
==
2
)
{
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
// check vector load of A
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
ABlockTransferSrcVectorDim
==
2
)
{
if
(
arg
.
KRaw_
%
ABlockTransferSrcScalarPerVector
!=
0
)
{
return
false
;
}
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
ABlockTransferSrcVectorDim
==
1
)
{
// FIXME: not rigorous
if
(
arg
.
MRaw_
%
ABlockTransferSrcScalarPerVector
!=
0
)
{
return
false
;
}
}
else
if
(
KRaw_
%
ABlockTransferSrcScalarPerVector
!=
0
)
{
return
false
;
}
// check vector laod of B
if
constexpr
(
is_same_v
<
BLayout
,
Col
>
&&
BBlockTransferSrcVectorDim
==
2
)
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
ABlockTransferSrcVectorDim
==
1
)
{
// FIXME: not rigorous
if
(
MRaw_
%
ABlockTransferSrcScalarPerVector
!=
0
)
{
if
(
arg
.
KRaw_
%
BBlockTransferSrcScalarPerVector
!=
0
)
{
return
false
;
}
return
false
;
}
else
if
constexpr
(
is_same_v
<
BLayout
,
Row
>
&&
BBlockTransferSrcVectorDim
==
1
)
}
else
{
return
false
;
}
// check vector laod of B
if
constexpr
(
is_same_v
<
BLayout
,
Col
>
&&
BBlockTransferSrcVectorDim
==
2
)
{
if
(
KRaw_
%
BBlockTransferSrcScalarPerVector
!=
0
)
{
// FIXME: not rigorous
if
(
arg
.
NRaw_
%
BBlockTransferSrcScalarPerVector
!=
0
)
{
return
false
;
}
return
false
;
}
else
}
else
if
constexpr
(
is_same_v
<
BLayout
,
Row
>
&&
BBlockTransferSrcVectorDim
==
1
)
{
// FIXME: not rigorous
if
(
NRaw_
%
BBlockTransferSrcScalarPerVector
!=
0
)
{
return
false
;
}
}
else
{
return
false
;
}
// check vector load of Ds
// only support RowMajor for now
bool
all_valid
=
true
;
static_for
<
0
,
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
// check vector load of Ds
// only support RowMajor for now
bool
all_valid
=
true
;
if
constexpr
(
!
is_same_v
<
DLayout
,
Row
>
)
{
all_valid
=
false
;
}
});
static_for
<
0
,
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
if
(
!
all_valid
)
if
constexpr
(
!
is_same_v
<
DLayout
,
Row
>
)
{
return
false
;
all_valid
=
false
;
}
});
// check vector store of E
// only support RowMajor for now
if
constexpr
(
is_same_v
<
ELayout
,
Row
>
)
{
if
(
arg
.
NRaw_
%
CDEBlockTransferScalarPerVector_NPerBlock
!=
0
)
{
return
false
;
}
}
else
if
(
!
all_valid
)
{
return
false
;
}
// check vector store of E
// only support RowMajor for now
if
constexpr
(
is_same_v
<
ELayout
,
Row
>
)
{
if
(
NRaw_
%
CDEBlockTransferScalarPerVector_NPerBlock
!=
0
)
{
return
false
;
}
}
else
{
return
false
;
}
return
true
;
}
return
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_m_k_
,
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
if
(
!
ck
::
is_xdl_supported
())
{
return
false
;
}
return
IsSupported
(
arg
.
MRaw_
,
arg
.
NRaw_
,
arg
.
KRaw_
)
and
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_m_k_
,
arg
.
b_grid_desc_n_k_
,
arg
.
ds_grid_desc_m_n_
,
arg
.
e_grid_desc_m_n_
,
...
...
@@ -708,6 +709,178 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
return
str
.
str
();
}
template
<
class
ADesc
,
class
BDesc
,
class
DsDesc
,
class
EDesc
>
struct
Descriptor
{
static
constexpr
auto
ds_tuple
()
{
return
transform_tuples
(
[
&
](
auto
d
)
constexpr
{
return
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
d
);
},
DsDesc
{});
}
using
AGridDesc_M_K
=
remove_cvref_t
<
decltype
(
DeviceOp
::
matrix_padder
.
PadADescriptor_M_K
(
ADesc
{}))
>
;
using
BGridDesc_N_K
=
remove_cvref_t
<
decltype
(
DeviceOp
::
matrix_padder
.
PadBDescriptor_N_K
(
BDesc
{}))
>
;
using
DsGridDesc_M_N
=
remove_cvref_t
<
decltype
(
ds_tuple
())
>
;
using
EGridDesc_M_N
=
remove_cvref_t
<
decltype
(
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
EDesc
{}))
>
;
using
AGridDesc_AK0_M_AK1
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeDefaultAGridDescriptor_AK0_M_AK1
(
DeviceOp
::
matrix_padder
.
PadADescriptor_M_K
(
ADesc
{})))
>
;
using
BGridDesc_BK0_N_BK1
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeDefaultBGridDescriptor_BK0_N_BK1
(
DeviceOp
::
matrix_padder
.
PadBDescriptor_N_K
(
BDesc
{})))
>
;
using
DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
ds_tuple
()))
>
;
using
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
EDesc
{})))
>
;
using
Block2ETileMap
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeDefaultBlock2ETileMap
(
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
EDesc
{})))
>
;
// tensor descriptors for problem definiton
AGridDesc_M_K
a_grid_desc_m_k
;
BGridDesc_N_K
b_grid_desc_n_k
;
DsGridDesc_M_N
ds_grid_desc_m_n
;
EGridDesc_M_N
e_grid_desc_m_n
;
// tensor descriptors for block/thread-wise copy
AGridDesc_AK0_M_AK1
a_grid_desc_ak0_m_ak1
;
BGridDesc_BK0_N_BK1
b_grid_desc_bk0_n_bk1
;
DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
ds_grid_desc_mblock_mperblock_nblock_nperblock
;
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
e_grid_desc_mblock_mperblock_nblock_nperblock
;
// block-to-e-tile map
Block2ETileMap
block_2_etile_map
;
// element-wise op
AElementwiseOperation
a_element_op
;
BElementwiseOperation
b_element_op
;
CDEElementwiseOperation
cde_element_op
;
// for checking vector load/store
index_t
MRaw
;
index_t
NRaw
;
index_t
KRaw
;
bool
has_main_k_block_loop
=
true
;
constexpr
Descriptor
(
ADesc
a
,
BDesc
b
,
DsDesc
ds
,
EDesc
e
,
AElementwiseOperation
a_element_op_
,
BElementwiseOperation
b_element_op_
,
CDEElementwiseOperation
cde_element_op_
)
:
a_grid_desc_m_k
{
DeviceOp
::
matrix_padder
.
PadADescriptor_M_K
(
a
)},
b_grid_desc_n_k
{
DeviceOp
::
matrix_padder
.
PadBDescriptor_N_K
(
b
)},
ds_grid_desc_m_n
{
transform_tuples
(
[
&
](
auto
d
)
constexpr
{
return
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
d
);
},
ds
)},
e_grid_desc_m_n
{
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
e
)},
a_grid_desc_ak0_m_ak1
{
GridwiseGemm
::
MakeDefaultAGridDescriptor_AK0_M_AK1
(
a_grid_desc_m_k
)},
b_grid_desc_bk0_n_bk1
{
GridwiseGemm
::
MakeDefaultBGridDescriptor_BK0_N_BK1
(
b_grid_desc_n_k
)},
ds_grid_desc_mblock_mperblock_nblock_nperblock
{
GridwiseGemm
::
MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
transform_tuples
(
[
&
](
auto
d
)
constexpr
{
return
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
d
);
},
ds
))},
e_grid_desc_mblock_mperblock_nblock_nperblock
{
GridwiseGemm
::
MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
e_grid_desc_m_n
)},
block_2_etile_map
{
GridwiseGemm
::
MakeDefaultBlock2ETileMap
(
e_grid_desc_m_n
)},
has_main_k_block_loop
{
GridwiseGemm
::
CalculateHasMainKBlockLoop
(
a_grid_desc_ak0_m_ak1
.
GetLength
(
I0
)
*
a_grid_desc_ak0_m_ak1
.
GetLength
(
I2
))},
a_element_op
{
a_element_op_
},
b_element_op
{
b_element_op_
},
cde_element_op
{
cde_element_op_
},
MRaw
{
e
.
GetLength
(
I0
)},
NRaw
{
e
.
GetLength
(
I1
)},
KRaw
{
a
.
GetLength
(
I1
)}
{
}
constexpr bool IsValid() const
{
    return GridwiseGemm::CheckValidity(
               a_grid_desc_m_k, b_grid_desc_n_k, ds_grid_desc_m_n, e_grid_desc_m_n, block_2_etile_map) and
           IsSupported(MRaw, NRaw, KRaw);
}

constexpr index_t GetBlockSize() const { return BlockSize; }

constexpr index_t GetGridSize() const { return block_2_etile_map.CalculateGridSize(e_grid_desc_m_n); }
};
template <class ADesc, class BDesc, class DsDesc, class EDesc>
static constexpr auto make_descriptor(ADesc a,
                                      BDesc b,
                                      DsDesc ds,
                                      EDesc e,
                                      AElementwiseOperation a_element_op     = AElementwiseOperation{},
                                      BElementwiseOperation b_element_op     = BElementwiseOperation{},
                                      CDEElementwiseOperation cde_element_op = CDEElementwiseOperation{})
{
    return Descriptor<ADesc, BDesc, DsDesc, EDesc>(a, b, ds, e, a_element_op, b_element_op, cde_element_op);
}
template <class Desc, class DsPointer>
__device__ static void Run(const Desc& desc,
                           const ADataType* __restrict__ p_a_grid,
                           const BDataType* __restrict__ p_b_grid,
                           DsPointer p_ds_grid,
                           EDataType* __restrict__ p_e_grid)
{
    __shared__ char p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    assert(desc.IsValid());

    if(desc.has_main_k_block_loop)
    {
        GridwiseGemm::template Run<true>(p_a_grid,
                                         p_b_grid,
                                         p_ds_grid,
                                         p_e_grid,
                                         p_shared_block,
                                         desc.a_element_op,
                                         desc.b_element_op,
                                         desc.cde_element_op,
                                         desc.a_grid_desc_ak0_m_ak1,
                                         desc.b_grid_desc_bk0_n_bk1,
                                         desc.ds_grid_desc_mblock_mperblock_nblock_nperblock,
                                         desc.e_grid_desc_mblock_mperblock_nblock_nperblock,
                                         desc.block_2_etile_map);
    }
    else
    {
        GridwiseGemm::template Run<false>(p_a_grid,
                                          p_b_grid,
                                          p_ds_grid,
                                          p_e_grid,
                                          p_shared_block,
                                          desc.a_element_op,
                                          desc.b_element_op,
                                          desc.cde_element_op,
                                          desc.a_grid_desc_ak0_m_ak1,
                                          desc.b_grid_desc_bk0_n_bk1,
                                          desc.ds_grid_desc_mblock_mperblock_nblock_nperblock,
                                          desc.e_grid_desc_mblock_mperblock_nblock_nperblock,
                                          desc.block_2_etile_map);
    }
}
};

} // namespace device
...
...
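Note on has_main_k_block_loop: the descriptor above initializes it from CalculateHasMainKBlockLoop(AK0 * AK1), i.e. from the padded K extent of the A descriptor. As an illustration only (the helper's body is not shown in this diff), the check boils down to asking whether more than one KPerBlock tile has to be iterated; the names below are assumptions made for the sketch, not the CK implementation:

// Illustration: does a GEMM with K columns need the "main" K loop, assuming the
// criterion is simply "more than one KPerBlock tile"? Not the CK source.
constexpr bool needs_main_k_block_loop(int K, int KPerBlock)
{
    const int num_k_loops = K / KPerBlock; // K assumed already padded to a multiple of KPerBlock
    return num_k_loops > 1;
}

static_assert(needs_main_k_block_loop(256, 64));  // four K tiles -> main loop taken
static_assert(!needs_main_k_block_loop(64, 64));  // single K tile -> tail-only path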
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
...
...
@@ -24,10 +24,10 @@ struct BlockToCTileMap_M00_N0_M01
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};

-    __host__ __device__ BlockToCTileMap_M00_N0_M01() = default;
+    __host__ __device__ constexpr BlockToCTileMap_M00_N0_M01() = default;

-    __host__ __device__ BlockToCTileMap_M00_N0_M01(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01 = 1)
+    __host__ __device__ constexpr BlockToCTileMap_M00_N0_M01(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01 = 1)
         : M01_(M01), underlying_map_(GetBlockToCTileMap(c_grid_desc_m_n, M01))
     {
     }
...
...
@@ -51,8 +51,8 @@ struct BlockToCTileMap_M00_N0_M01
     }

     template <typename CTileIdx, typename CTileDim>
-    __host__ __device__ bool ValidCTileIndex(const CTileIdx& c_tile_idx,
-                                             const CTileDim& c_tile_dim) const
+    __host__ __device__ constexpr bool ValidCTileIndex(const CTileIdx& c_tile_idx,
+                                                       const CTileDim& c_tile_dim) const
     {
         if constexpr(DeviceCTileIndexCheck)
             return DefaultValidCTileIndex(c_tile_idx, c_tile_dim);
...
...
@@ -60,7 +60,7 @@ struct BlockToCTileMap_M00_N0_M01
             return true;
     }

-    __host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
+    __host__ constexpr bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
     {
         if constexpr(DeviceCTileIndexCheck)
             return true; // validity check moved to kernel
...
...
@@ -120,18 +120,19 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};

-    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt() = default;
+    __host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt() = default;

-    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const BlockToCTileMap_M00_N0_M01Adapt&) = default;
-    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(BlockToCTileMap_M00_N0_M01Adapt&&) = default;
-    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt&
-    operator=(const BlockToCTileMap_M00_N0_M01Adapt&) = default;
-    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt&
-    operator=(BlockToCTileMap_M00_N0_M01Adapt&&) = default;
+    __host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt(const BlockToCTileMap_M00_N0_M01Adapt&) = default;
+    __host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt(BlockToCTileMap_M00_N0_M01Adapt&&) = default;
+    __host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt&
+    operator=(const BlockToCTileMap_M00_N0_M01Adapt&) = default;
+    __host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt&
+    operator=(BlockToCTileMap_M00_N0_M01Adapt&&) = default;

-    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(index_t M, index_t N, index_t M01 = 8)
+    __host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt(index_t M, index_t N, index_t M01 = 8)
         : M_(M), N_(N), M01_(M01)
     {
#if 0
...
...
@@ -142,8 +143,9 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
     }

     template <typename CGridDesc_M_N>
-    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01 = 8)
+    __host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01 = 8)
         : BlockToCTileMap_M00_N0_M01Adapt(
               c_grid_desc_m_n.GetLength(I0), c_grid_desc_m_n.GetLength(I1), M01)
     {
...
@@ -164,7 +166,7 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
     }

     template <typename CGridDesc_M_N>
-    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
+    __host__ constexpr bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
     {
         return true;
     }
...
...
@@ -237,8 +239,8 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
     }

     template <typename CTileIdx, typename CTileDim>
-    __host__ __device__ bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */,
-                                             const CTileDim& /* c_tile_dim */) const
+    __host__ __device__ constexpr bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */,
+                                                       const CTileDim& /* c_tile_dim */) const
     {
         return true; // always valid provided that user gets grid size from CalculateGridSize()
     }
...
...
@@ -616,7 +618,10 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt
         return true; // always valid provided that user gets grid size from CalculateGridSize()
     }

-    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const { return true; }
+    __host__ constexpr bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
+    {
+        return true;
+    }

     private:
     index_t M01_;
...
...
@@ -674,7 +679,7 @@ struct BlockToCTileMap_M00_N00_M01_N01
             return true;
     }

-    __host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
+    __host__ constexpr bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
     {
         if constexpr(DeviceCTileIndexCheck)
             return true; // validity check moved to kernel
...
...
@@ -786,7 +791,7 @@ struct BlockToCTileMap_KSplit_M00_N00_M01_N01
             return true;
     }

-    __host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
+    __host__ constexpr bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
     {
         if constexpr(DeviceCTileIndexCheck)
             return true; // validity check moved to kernel
...
...
@@ -910,7 +915,7 @@ struct OffsettedBlockToCTileMap
     }

     template <typename CGridDesc_M_N>
-    __host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
+    __host__ constexpr bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
     {
         return block_to_ctile_map_.CheckValidity(c_grid_desc_m_n);
     }
...
...
@@ -967,7 +972,7 @@ struct BlockToCTileMap_3DGrid_KSplit
     }

     template <typename CGridDesc_M_N>
-    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
+    __host__ constexpr bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
     {
         return true;
     }
...
...
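All of the hunks above only add constexpr to the block-to-C-tile maps so they can be evaluated at compile time by the codegen descriptor path; the mapping logic itself is unchanged. For readers unfamiliar with these maps, the sketch below shows the basic idea behind an M01-grouped map: consecutive 1-D block ids are swizzled so that a group of M01 output tiles along M is visited before moving along N, which improves reuse of B tiles in cache. This is a simplified illustration with made-up names, not the exact formula used by BlockToCTileMap_M00_N0_M01Adapt (which also handles padding, k-split, and partial edge groups).

// Simplified block-id -> (m-tile, n-tile) swizzle that walks M01 rows of tiles
// per column before advancing to the next column. Illustration only.
#include <cstdio>
#include <utility>

std::pair<int, int> block_to_tile(int block_1d_id, int M0, int N0, int M01)
{
    const int group_size = M01 * N0;          // tiles covered by one group of M01 rows
    const int group      = block_1d_id / group_size;
    const int local      = block_1d_id % group_size;
    const int m = group * M01 + local % M01;  // row within the group
    const int n = local / M01;                // column within the group
    return {m, n};
}

int main()
{
    // 4 x 4 tile grid, rows grouped in pairs (M0 = N0 = 4, M01 = 2)
    for(int id = 0; id < 16; ++id)
    {
        auto [m, n] = block_to_tile(id, 4, 4, 2);
        std::printf("block %2d -> tile (%d, %d)\n", id, m, n);
    }
}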
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
...
...
@@ -264,7 +264,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                const BGridDesc_N_K& b_grid_desc_n_k,
                const DsGridDesc_M_N& ds_grid_desc_m_n,
                const EGridDesc_M_N& e_grid_desc_m_n,
-               const Block2ETileMap& block_2_etile_map)
+               const Block2ETileMap&)
     {
         static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
                           (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
...
@@ -310,10 +310,10 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
         }

         // check block-to-E-tile
-        if(!block_2_etile_map.CheckValidity(e_grid_desc_m_n))
-        {
-            return false;
-        }
+        // if(!block_2_etile_map.CheckValidity(e_grid_desc_m_n))
+        // {
+        //     return false;
+        // }

         // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
         // check tensor size: cannot be larger than 2GB each
...
...
include/ck/utility/type_convert.hpp
...
...
@@ -166,9 +166,6 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
 {
     constexpr int seed = 1254739;
     uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
-    float max_fp8 = 240.0f;
-    if(!std::isinf(x))
-        x = x > max_fp8 ? max_fp8 : (x < -max_fp8 ? -max_fp8 : x);
 #if defined(__gfx94__)
     union
     {
...
...
@@ -176,10 +173,15 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
         uint32_t i32val;
         uint8_t i8val[4]; // not endian independent
     } val;
-    val.fval      = x;
-    uint32_t ival = 0;
-    ival = __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0); // 0 pos
-    val.i32val = ival;
+    val.fval      = x;
+    uint32_t ival = 0;
+
+    const float max_fp8 = 240.0f;
+    // if x is not +/- infinity or nan
+    if((val.i32val & NumericUtils<float>::nan_mask) != NumericUtils<float>::Inf)
+        // clip float value
+        val.fval = __builtin_amdgcn_fmed3f(val.fval, max_fp8, -max_fp8);
+
+    ival = __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0); // 0 pos
+    val.i32val = ival;
     return val.i8val[0]; // little endian
 #else
     constexpr bool negative_zero_nan = true;
...
...
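The path above converts to fp8 with stochastic rounding: prand_generator derives a pseudo-random word from the value and its address, and __builtin_amdgcn_cvt_sr_fp8_f32 uses that word to decide which neighbouring representable value to pick. As a plain-C++ illustration of stochastic rounding itself (to the integer grid rather than to fp8, and with a standard RNG instead of CK's generator, so this is not the implementation in this file):

// Stochastic rounding of a float to an integer: round up with probability equal
// to the fractional distance to the next grid point. Illustration only.
#include <cmath>
#include <random>

float stochastic_round(float x, std::mt19937& gen)
{
    std::uniform_real_distribution<float> uni(0.0f, 1.0f);
    const float lo = std::floor(x);
    return (x - lo) > uni(gen) ? lo + 1.0f : lo;
}
// Averaged over many calls, stochastic_round(0.25f, gen) is ~0.25, whereas
// round-to-nearest would always return 0, introducing a systematic bias.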
@@ -223,10 +225,15 @@ inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
         uint32_t i32val;
         uint8_t i8val[4]; // not endian independent
     } val;
-    val.fval      = x;
-    uint32_t ival = 0;
-    ival = __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
-    val.i32val = ival;
+    val.fval      = x;
+    uint32_t ival = 0;
+
+    const float max_bf8 = 57344.0f;
+    // if x is not +/- infinity or nan
+    if((val.i32val & NumericUtils<float>::nan_mask) != NumericUtils<float>::Inf)
+        // clip float value
+        val.fval = __builtin_amdgcn_fmed3f(val.fval, max_bf8, -max_bf8);
+
+    ival = __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
+    val.i32val = ival;
     return val.i8val[0]; // little endian
 #else
     constexpr bool negative_zero_nan = true;
...
...
@@ -265,9 +272,6 @@ __host__ __device__ constexpr Y f8_convert_rne(X x);
 template <>
 inline __host__ __device__ f8_t f8_convert_rne<f8_t, float>(float x)
 {
-    float max_fp8 = 240.0f;
-    if(!std::isinf(x))
-        x = x > max_fp8 ? max_fp8 : (x < -max_fp8 ? -max_fp8 : x);
 #if defined(__gfx94__)
     union
     {
...
...
@@ -275,8 +279,13 @@ inline __host__ __device__ f8_t f8_convert_rne<f8_t, float>(float x)
         uint32_t i32val;
         uint8_t i8val[4]; // not endian independent
     } val;
-    val.fval      = x;
-    uint32_t ival = 0;
+    val.fval      = x;
+    uint32_t ival = 0;
+
+    const float max_fp8 = 240.0f;
+    // if x is not +/- infinity or nan
+    if((val.i32val & NumericUtils<float>::nan_mask) != NumericUtils<float>::Inf)
+        // clip float value
+        val.fval = __builtin_amdgcn_fmed3f(val.fval, max_fp8, -max_fp8);
+
     ival = __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false); // false -> WORD0
     val.i32val = ival;
     return val.i8val[0];
...
...
@@ -320,8 +329,13 @@ inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, float>(float x)
         uint32_t i32val;
         uint8_t i8val[4]; // not endian independent
     } val;
-    val.fval      = x;
-    uint32_t ival = 0;
+    val.fval      = x;
+    uint32_t ival = 0;
+
+    const float max_bf8 = 57344.0f;
+    // if x is not +/- infinity or nan
+    if((val.i32val & NumericUtils<float>::nan_mask) != NumericUtils<float>::Inf)
+        // clip float value
+        val.fval = __builtin_amdgcn_fmed3f(val.fval, max_bf8, -max_bf8);
+
     ival = __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, val.fval, ival, false); // false -> WORD0
     val.i32val = ival;
     return val.i8val[0];
...
...
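The common thread in the four type_convert.hpp hunks: instead of pre-clamping on the host with std::isinf and a ternary, the gfx94 fast path now clips finite values to the largest representable magnitudes used in this file (240.0f for f8_t, 57344.0f for bf8_t) with __builtin_amdgcn_fmed3f, and skips the clip for NaN/Inf by testing the float's exponent bits directly. For finite x, fmed3(x, max, -max) is just a clamp; a portable equivalent, shown only as an illustration and not as the CK code path:

// Portable equivalent of the fmed3-based clip applied before fp8/bf8 conversion:
// finite values are clamped to +/-max, while NaN and +/-Inf pass through untouched.
#include <algorithm>
#include <cmath>

float clip_for_f8(float x, float max_f8 /* e.g. 240.0f for f8_t, 57344.0f for bf8_t */)
{
    if(!std::isfinite(x))
        return x;                           // leave NaN/Inf encodings to the converter
    return std::clamp(x, -max_f8, max_f8);  // median of {x, max, -max} for finite x
}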