gaoqiong / composable_kernel · Commit 516bbdcb
Authored Apr 03, 2023 by Astha Rai
Parent: fde6d274

    initial push with templated dev op

Changes: 24 files; showing 4 changed files with 1128 additions and 0 deletions:
python/AIT implementation/sample files/model_interface.cpp    +290  −0
python/AIT implementation/sample files/model_interface.h      +212  −0
python/AIT implementation/sample files/norm_common.py         +498  −0
python/AIT implementation/sample files/permute_common.py      +128  −0
python/AIT implementation/sample files/model_interface.cpp (new file, mode 100644)
#include "model_interface.h"
#include <iostream>
#include <unordered_map>
#include "model-generated.h"
#include "model_container.h"
// Important: don't let exceptions escape the functions below.
// They can cause problems when -fvisibility=hidden. But more
// importantly, they can crash the program if they try to cross
// the language boundary into Python.
#define CONVERT_EXCEPTION_TO_ERROR_CODE(...)          \
  try {                                               \
    __VA_ARGS__                                       \
  } catch (const std::exception& e) {                 \
    LOG(ERROR) << "Error: " << e.what();              \
    return AITemplateError::AITemplateFailure;        \
  } catch (...) {                                     \
    LOG(ERROR) << "Unknown exception occurred.";      \
    return AITemplateError::AITemplateFailure;        \
  }                                                   \
  return AITemplateError::AITemplateSuccess;

#define RETURN_ERROR_IF_NULL(var)                           \
  if (var == nullptr) {                                     \
    LOG(ERROR) << "Variable " << #var << " can't be null";  \
    return AITemplateError::AITemplateFailure;              \
  }
namespace ait {
namespace {

class DefaultAllocator : public AITemplateAllocator {
 public:
  void* Allocate(size_t n_bytes) override {
    void* result;
    DEVICE_CHECK(DeviceMalloc(&result, n_bytes));
    return result;
  }

  void Free(void* ptr) override {
    DEVICE_CHECK(FreeDeviceMemory(ptr));
  }
};

class TrackingAllocator : public DefaultAllocator {
 public:
  void* Allocate(size_t n_bytes) override {
    auto* result = DefaultAllocator::Allocate(n_bytes);
    num_bytes_ += n_bytes;
    return result;
  }

  size_t NumBytesAllocated() const {
    return num_bytes_;
  }

 private:
  size_t num_bytes_ = 0;
};

DefaultAllocator default_allocator;

} // namespace
} // namespace ait
extern "C" {

AITemplateError AITemplateModelContainerCreate(
    AITemplateModelHandle* ret,
    size_t num_runtimes,
    AITemplateAllocator* allocator) {
  if (num_runtimes == 0) {
    LOG(ERROR) << "num_runtimes must be positive, but got 0";
    return AITemplateError::AITemplateFailure;
  }
  RETURN_ERROR_IF_NULL(ret)
  AITemplateAllocator& allocator_ref =
      allocator == nullptr ? ait::default_allocator : *allocator;
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    auto* m = ait::CreateModelContainer(num_runtimes, allocator_ref);
    *ret = reinterpret_cast<AITemplateModelHandle>(m);
  })
}

AITemplateError AITemplateModelContainerDelete(AITemplateModelHandle handle) {
  RETURN_ERROR_IF_NULL(handle)
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
    delete m;
  });
}

AITemplateError AITemplateModelContainerSetConstant(
    AITemplateModelHandle handle,
    const char* name,
    const AITData* tensor) {
  RETURN_ERROR_IF_NULL(handle)
  RETURN_ERROR_IF_NULL(tensor)
  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({ m->SetConstant(name, *tensor); })
}

AITemplateError AITemplateModelContainerRun(
    AITemplateModelHandle handle,
    const AITData* inputs,
    size_t num_inputs,
    AITData* outputs,
    size_t num_outputs,
    AITemplateStreamHandle stream_handle,
    bool sync,
    bool graph_mode,
    int64_t** output_shapes_out) {
  RETURN_ERROR_IF_NULL(handle)
  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
  auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    m->Run(
        inputs,
        num_inputs,
        outputs,
        num_outputs,
        stream,
        sync,
        graph_mode,
        output_shapes_out);
  })
}

AITemplateError AITemplateModelContainerRunWithOutputsOnHost(
    AITemplateModelHandle handle,
    const AITData* inputs,
    size_t num_inputs,
    AITData* outputs,
    size_t num_outputs,
    AITemplateStreamHandle stream_handle,
    bool graph_mode,
    int64_t** output_shapes_out) {
  RETURN_ERROR_IF_NULL(handle)
  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
  auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    m->RunWithOutputsOnHost(
        inputs,
        num_inputs,
        outputs,
        num_outputs,
        stream,
        graph_mode,
        output_shapes_out);
  })
}

AITemplateError AITemplateModelContainerBenchmark(
    AITemplateModelHandle handle,
    const AITData* inputs,
    size_t num_inputs,
    AITData* outputs,
    size_t num_outputs,
    AITemplateStreamHandle stream_handle,
    bool graph_mode,
    size_t count,
    size_t num_threads,
    bool use_unique_stream_per_thread,
    float* runtime_ms,
    int64_t** output_shapes_out) {
  RETURN_ERROR_IF_NULL(handle)
  RETURN_ERROR_IF_NULL(runtime_ms)
  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
  auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    *runtime_ms = m->Benchmark(
        inputs,
        num_inputs,
        outputs,
        num_outputs,
        stream,
        graph_mode,
        count,
        num_threads,
        use_unique_stream_per_thread,
        output_shapes_out);
  })
}

AITemplateError AITemplateModelContainerGetNumInputs(
    AITemplateModelHandle handle,
    size_t* num_inputs_out) {
  RETURN_ERROR_IF_NULL(handle)
  RETURN_ERROR_IF_NULL(num_inputs_out)
  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({ *num_inputs_out = m->NumInputs(); })
}

AITemplateError AITemplateModelContainerGetInputName(
    AITemplateModelHandle handle,
    size_t input_idx,
    const char** input_name_out) {
  RETURN_ERROR_IF_NULL(handle)
  RETURN_ERROR_IF_NULL(input_name_out)
  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({ *input_name_out = m->InputName(input_idx); })
}

AITemplateError AITemplateModelContainerGetNumOutputs(
    AITemplateModelHandle handle,
    size_t* num_outputs_out) {
  RETURN_ERROR_IF_NULL(handle)
  RETURN_ERROR_IF_NULL(num_outputs_out)
  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({ *num_outputs_out = m->NumOutputs(); })
}

AITemplateError AITemplateModelContainerGetOutputName(
    AITemplateModelHandle handle,
    size_t output_idx,
    const char** output_name_out) {
  RETURN_ERROR_IF_NULL(handle)
  RETURN_ERROR_IF_NULL(output_name_out)
  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({ *output_name_out = m->OutputName(output_idx); })
}

AITemplateError AITemplateModelContainerGetMaximumOutputShape(
    AITemplateModelHandle handle,
    size_t output_idx,
    AITemplateParamShape* shape_out) {
  RETURN_ERROR_IF_NULL(handle)
  RETURN_ERROR_IF_NULL(shape_out)
  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({ *shape_out = m->MaxOutputShape(output_idx); })
}

AITemplateError AITemplateModelContainerGetOutputDtype(
    AITemplateModelHandle handle,
    size_t output_idx,
    AITemplateDtype* dtype_out) {
  RETURN_ERROR_IF_NULL(handle)
  RETURN_ERROR_IF_NULL(dtype_out)
  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({ *dtype_out = m->OutputDtype(output_idx); })
}
AITemplateError AITemplateModelContainerGetNumRuntimes(
    AITemplateModelHandle handle,
    size_t* num_runtimes_out) {
  RETURN_ERROR_IF_NULL(handle)
  RETURN_ERROR_IF_NULL(num_runtimes_out)
  auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({ *num_runtimes_out = m->GetNumRuntimes(); })
}
AITemplateError AITemplateAllocatorCreate(
    AITemplateAllocator** allocator_out,
    AITemplateAllocatorType allocator_type) {
  RETURN_ERROR_IF_NULL(allocator_out);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    switch (allocator_type) {
      case AITemplateAllocatorType::kDefault:
        *allocator_out = new ait::DefaultAllocator();
        break;
      case AITemplateAllocatorType::kTracking:
        *allocator_out = new ait::TrackingAllocator();
        break;
      default:
        throw std::runtime_error("Unrecognized allocator type");
    }
  });
}

AITemplateError AITemplateAllocatorDelete(AITemplateAllocator* allocator) {
  RETURN_ERROR_IF_NULL(allocator);
  delete allocator;
  return AITemplateError::AITemplateSuccess;
}

AITemplateError AITemplateTrackingAllocatorGetNumBytes(
    AITemplateAllocator* allocator,
    size_t* num_bytes_out) {
  RETURN_ERROR_IF_NULL(allocator);
  RETURN_ERROR_IF_NULL(num_bytes_out);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    auto* tracking_allocator = dynamic_cast<ait::TrackingAllocator*>(allocator);
    if (tracking_allocator == nullptr) {
      throw std::runtime_error("Allocator was not a tracking allocator!");
    }
    *num_bytes_out = tracking_allocator->NumBytesAllocated();
  });
}

} // extern "C"
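For context, the comment at the top of the file hints at the intended consumer: these entry points are exported with C linkage so they can be called across the language boundary from Python. Below is a minimal ctypes sketch of the create/query/delete lifecycle; the "./model.so" path and the single runtime are illustrative assumptions, while the function names and signatures come from model_interface.h.

# Hypothetical driver for the C API above (paths and values are assumptions).
import ctypes

lib = ctypes.CDLL("./model.so")  # assumed: an AITemplate-compiled model library

handle = ctypes.c_void_p()  # AITemplateModelHandle is an opaque pointer
# AITemplateError: 0 == AITemplateSuccess, 1 == AITemplateFailure.
err = lib.AITemplateModelContainerCreate(
    ctypes.byref(handle),
    ctypes.c_size_t(1),  # num_runtimes must be positive
    None,                # null allocator -> ait::default_allocator
)
assert err == 0

num_inputs = ctypes.c_size_t()
err = lib.AITemplateModelContainerGetNumInputs(handle, ctypes.byref(num_inputs))
assert err == 0
print("model has", num_inputs.value, "inputs")

lib.AITemplateModelContainerDelete(handle)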
python/AIT implementation/sample files/model_interface.h (new file, mode 100644)
#pragma once
#include <stddef.h>
#include <stdint.h>
#include <numeric>
#include <stdexcept>
#include <utility>
#include <vector>
// We compile all models with -fvisibility=hidden. Any symbols that need to be
// exposed in the final shared library must be declared with AIT_EXPORT to make
// them visible.
#ifdef __GNUC__ // Applies to any compiler with GNU extensions (clang and g++)
#define AIT_EXPORT __attribute__((__visibility__("default")))
#else
#ifdef _WIN32
#define AIT_EXPORT __declspec(dllexport)
#else
#define AIT_EXPORT
#endif
#endif
struct AITemplateModelOpaque {};
using AITemplateModelHandle = AITemplateModelOpaque*;

enum class AITemplateError : int {
  AITemplateSuccess = 0,
  AITemplateFailure = 1,
};

struct AITemplateParamShape {
  AITemplateParamShape() : shape_data(nullptr), size(0) {}
  AITemplateParamShape(const int64_t* shape_data_in, size_t size_in)
      : shape_data(shape_data_in), size(size_in) {}

  const int64_t* shape_data;
  size_t size;

  size_t Numel() const {
    return std::accumulate(
        shape_data, shape_data + size, 1, std::multiplies<int64_t>());
  }
};

enum class AITemplateDtype {
  kUnset = 0,
  kHalf,
  kFloat,
  kInt,
  kLong,
  kBool,
};

struct AITData {
  AITData() : ptr(nullptr), dtype(AITemplateDtype::kUnset) {}

  AITData(
      void* ptr_in,
      const AITemplateParamShape& shape_in,
      AITemplateDtype dtype_in)
      : ptr(ptr_in), shape(shape_in), dtype(dtype_in) {}

  void* ptr;
  AITemplateParamShape shape;
  AITemplateDtype dtype;
};

inline size_t AITemplateDtypeSizeBytes(AITemplateDtype dtype) {
  switch (dtype) {
    case AITemplateDtype::kHalf:
      return 2;
    case AITemplateDtype::kFloat:
      return 4;
    case AITemplateDtype::kInt:
      return 4;
    case AITemplateDtype::kLong:
      return 8;
    case AITemplateDtype::kBool:
      return 1;
    case AITemplateDtype::kUnset:
      throw std::runtime_error("Unset dtype has no size!");
  }
}

struct AITemplateStreamOpaque {};
using AITemplateStreamHandle = AITemplateStreamOpaque*;

// Allocator to use for GPU mallocs and frees. Allocations will only happen
// when the ModelContainer is created.
class AITemplateAllocator {
 public:
  virtual void* Allocate(size_t nbytes) = 0;
  virtual void Free(void* ptr) = 0;
  virtual ~AITemplateAllocator() = default;
};

// Some custom allocators are provided. They can be created by passing
// an enum into the AITemplateAllocatorCreate() function.
enum class AITemplateAllocatorType {
  // The default allocator just uses the backend's default malloc/free.
  kDefault = 0,
  // The tracking allocator is like the default allocator, but it keeps
  // track of how many bytes it has allocated. Mainly used for testing.
  kTracking,
};
extern "C" {

// Create a ModelContainer. See model_container.h for all the details.
// Some important high-level notes:
// * If allocator is null, a default allocator is used (forwards to
//   {cuda/hip}{Malloc/Free}).
// * We assume that the allocator lives at least as long as the ModelContainer.
AIT_EXPORT AITemplateError AITemplateModelContainerCreate(
    AITemplateModelHandle* ret,
    size_t num_runtimes,
    AITemplateAllocator* allocator = nullptr);

AIT_EXPORT AITemplateError AITemplateModelContainerDelete(
    AITemplateModelHandle handle);

AIT_EXPORT AITemplateError AITemplateModelContainerSetConstant(
    AITemplateModelHandle handle,
    const char* name,
    const AITData* tensor);

AIT_EXPORT AITemplateError AITemplateModelContainerRun(
    AITemplateModelHandle handle,
    const AITData* inputs,
    size_t num_inputs,
    AITData* outputs,
    size_t num_outputs,
    AITemplateStreamHandle stream_handle,
    bool sync,
    bool graph_mode,
    int64_t** output_shapes_out);

// Like AITemplateModelContainerRun, but expects outputs to be allocated on the
// host. Does an extra sync/copy at the end to copy them over. Warning: don't
// use this! It's not optimal with respect to performance. It's here for use by
// internal constant folding passes.
AIT_EXPORT AITemplateError AITemplateModelContainerRunWithOutputsOnHost(
    AITemplateModelHandle handle,
    const AITData* inputs,
    size_t num_inputs,
    AITData* outputs,
    size_t num_outputs,
    AITemplateStreamHandle stream_handle,
    bool graph_mode,
    int64_t** output_shapes_out);

AIT_EXPORT AITemplateError AITemplateModelContainerBenchmark(
    AITemplateModelHandle handle,
    const AITData* inputs,
    size_t num_inputs,
    AITData* outputs,
    size_t num_outputs,
    AITemplateStreamHandle stream_handle,
    bool graph_mode,
    size_t count,
    size_t num_threads,
    bool use_unique_stream_per_thread,
    float* runtime_ms,
    int64_t** output_shapes_out);

AIT_EXPORT AITemplateError AITemplateModelContainerGetNumInputs(
    AITemplateModelHandle handle,
    size_t* num_inputs_out);

AIT_EXPORT AITemplateError AITemplateModelContainerGetInputName(
    AITemplateModelHandle handle,
    size_t input_idx,
    const char** input_name_out);

AIT_EXPORT AITemplateError AITemplateModelContainerGetNumOutputs(
    AITemplateModelHandle handle,
    size_t* num_outputs_out);

AIT_EXPORT AITemplateError AITemplateModelContainerGetOutputName(
    AITemplateModelHandle handle,
    size_t output_idx,
    const char** output_name_out);

AIT_EXPORT AITemplateError AITemplateModelContainerGetMaximumOutputShape(
    AITemplateModelHandle handle,
    size_t output_idx,
    AITemplateParamShape* shape_out);

AIT_EXPORT AITemplateError AITemplateModelContainerGetOutputDtype(
    AITemplateModelHandle handle,
    size_t output_idx,
    AITemplateDtype* out);

AIT_EXPORT AITemplateError AITemplateModelContainerGetNumRuntimes(
    AITemplateModelHandle handle,
    size_t* num_runtimes_out);

AIT_EXPORT AITemplateError AITemplateAllocatorCreate(
    AITemplateAllocator** allocator_out,
    AITemplateAllocatorType allocator_type);

AIT_EXPORT AITemplateError AITemplateAllocatorDelete(
    AITemplateAllocator* allocator);

// Get the number of bytes allocated; mainly used for testing.
AIT_EXPORT AITemplateError AITemplateTrackingAllocatorGetNumBytes(
    AITemplateAllocator* allocator,
    size_t* num_bytes_out);

} // extern "C"
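To show how the allocator API composes with container creation, here is a hedged ctypes sketch that creates a tracking allocator (kTracking has enum value 1 above), hands it to AITemplateModelContainerCreate, and reads back the byte count; the library path is again an assumption.

# Hypothetical sketch: measure GPU memory allocated at container creation.
import ctypes

lib = ctypes.CDLL("./model.so")  # assumed compiled model library

allocator = ctypes.c_void_p()
# AITemplateAllocatorType: kDefault == 0, kTracking == 1.
assert lib.AITemplateAllocatorCreate(ctypes.byref(allocator), 1) == 0

handle = ctypes.c_void_p()
# Allocations only happen when the ModelContainer is created, so the
# tracking allocator sees the container's full device footprint.
assert lib.AITemplateModelContainerCreate(
    ctypes.byref(handle), ctypes.c_size_t(1), allocator) == 0

num_bytes = ctypes.c_size_t()
assert lib.AITemplateTrackingAllocatorGetNumBytes(
    allocator, ctypes.byref(num_bytes)) == 0
print("container allocated", num_bytes.value, "bytes")

# The allocator must outlive the container: delete the container first.
lib.AITemplateModelContainerDelete(handle)
lib.AITemplateAllocatorDelete(allocator)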
python/AIT implementation/sample files/norm_common.py (new file, mode 100644)
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Normalization common codegen for ROCM.
"""
import os
import re
from hashlib import sha1
from typing import Any, Dict, OrderedDict

import jinja2

from ...target import Target

FUNC_CALL_PARAM_TEMPLATE = jinja2.Template("(void *)({{name}})")

INSTANCE_TEMPLATE = jinja2.Template(
    """
{{config}}
using {{name}} = {{ config_name }};
"""
)

ARGS_PARSE_TEMPLATE = jinja2.Template(
    """
{% for idx in range(rank) %}
    const int64_t in_{{idx}} = std::stoi(argv[{{ idx + 1 }}]);
{% endfor %}
"""
)
STRUCTS_DEF_TEMPLATE = jinja2.Template(
    """
struct ProfilerMemoryPool {
  ProfilerMemoryPool() {
    std::random_device rd;
    gen = std::mt19937(rd());
    uniform_dist = std::uniform_int_distribution<int64_t>(1, 48964896);
    // The rocRAND generator must be created before it can be seeded or used.
    rocrand_create_generator(&generator, ROCRAND_RNG_PSEUDO_DEFAULT);
    offsets.reserve(512);
    strides.reserve(512);
    copies.reserve(512);
    ptrs.reserve(512);
  }

  ~ProfilerMemoryPool() {
    for (size_t i = 0; i < ptrs.size(); i++) {
      hipFree(ptrs[i]);
    }
    rocrand_destroy_generator(generator);
  }

  template <typename DType>
  DType* AllocateGaussianTensor(int64_t size) {
    size_t length = size * sizeof(DType);
    DType* d_x;
    hipMalloc(&d_x, length);

    float mean = 0.0f;
    float stddev = 1.0f;
    uint64_t seed = uniform_dist(gen);
    rocrand_set_seed(generator, seed);
    rocrand_generate_normal(generator, reinterpret_cast<float*>(d_x), size, mean, stddev);

    return d_x;
  }

  ck::half_t* AllocateHalfGaussianTensor(int64_t size) {
    return reinterpret_cast<ck::half_t*>(
        AllocateGaussianTensor<ck::half_t>(size));
  }

  int AllocateHalfTensor(int64_t size, int64_t copy) {
    offsets.push_back(0);
    strides.push_back(size);
    copies.push_back(copy);
    auto ptr = AllocateHalfGaussianTensor(size * copy);
    ptrs.push_back(reinterpret_cast<void*>(ptr));
    return ptrs.size() - 1;
  }

  ck::half_t* RequestHalfTensorByIdx(int idx) {
    auto copy = copies.at(idx);
    auto offset = offsets.at(idx);
    auto stride = strides.at(idx);
    ck::half_t* ptr = reinterpret_cast<ck::half_t*>(ptrs.at(idx));
    ptr += offset;
    offset += stride;
    if (offset == copy * stride) {
      offset = 0;
    }
    offsets[idx] = offset;
    return ptr;
  }

  std::vector<int64_t> offsets;
  std::vector<int64_t> strides;
  std::vector<int64_t> copies;
  std::vector<void*> ptrs;
  std::mt19937 gen;
  std::uniform_int_distribution<int64_t> uniform_dist;
  rocrand_generator generator;
};
// hack for DeviceMem linking error
// TODO fix this by making CK a header-only lib
// <<< hack begin
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p) const
{
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p) const
{
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
struct KernelTimerImpl
{
  KernelTimerImpl() {
    hipGetErrorString(hipEventCreate(&mStart));
    hipGetErrorString(hipEventCreate(&mEnd));
  }
  ~KernelTimerImpl() {
    hipGetErrorString(hipEventDestroy(mStart));
    hipGetErrorString(hipEventDestroy(mEnd));
  }
  void Start() {
    hipGetErrorString(hipDeviceSynchronize());
    hipGetErrorString(hipEventRecord(mStart, nullptr));
  }
  void End() {
    hipGetErrorString(hipEventRecord(mEnd, nullptr));
    hipGetErrorString(hipEventSynchronize(mEnd));
  }
  float GetElapsedTime() const {
    float time;
    hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
    return time;
  }
  hipEvent_t mStart, mEnd;
};
// >>> hack end
"""
)
PROFILER_TEMPLATE = jinja2.Template(
    """
size_t GLOBAL_WORKSPACE_SIZE = 0;
{{op_func}}
{{structs_def}}
int main(int argc, char** argv) {
{{args_parse}}
auto memory_pool = std::make_unique<ProfilerMemoryPool>();
hipStream_t stream = nullptr;
{{tensor_decl}}
// warmup
for(int i = 0; i < 3; ++i) {
{{func_call}}
}
// run
KernelTimerImpl timer;
timer.Start();
for(int i = 0; i < 5; ++i) {
{{func_call}}
}
timer.End();
std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl;
std::cout << "TIME:" << timer.GetElapsedTime() << std::endl;
}
"""
)
FUNC_TEMPLATE = jinja2.Template(
    """
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <random>
#include <rocrand/rocrand.h>
#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/reduction_operator.hpp"
{{extra_headers}}
{{extra_code}}
{{instances_decl}}
{{func_signature}}
{
{{shape_eval}}
{{exec_paths}}
}
"""
)
FUNC_CALL_TEMPLATE = jinja2.Template(
    """
{{indent}}{{func_name}}(
{{indent}} {{input}},
{{indent}} {{output}},
{% for name in input_dim_names %}
{{indent}} const_cast<int64_t *>(&{{name}}),
{% endfor %}
{{indent}} stream
{{indent}});
"""
)
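As a quick illustration of what this template expands to, rendering it with made-up values (the names below are not from this commit) yields an ordinary C++ call:

# Example render of FUNC_CALL_TEMPLATE with made-up values.
print(
    FUNC_CALL_TEMPLATE.render(
        func_name="layernorm_0",          # hypothetical kernel name
        input="(void *)(in_ptr)",         # hypothetical buffer parameters
        output="(void *)(out_ptr)",
        input_dim_names=["in_0", "in_1"],
        indent="  ",
    )
)
# Emits (whitespace trimmed):
#   layernorm_0(
#     (void *)(in_ptr),
#     (void *)(out_ptr),
#     const_cast<int64_t *>(&in_0),
#     const_cast<int64_t *>(&in_1),
#     stream
#   );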
def extract_config(func_attrs):
    """Extract (operation name, operation instance) pairs
    from all operation candidates and store them in
    func_attrs["op_instance"].

    Parameters
    ----------
    func_attrs : Dict
        Operation attributes. The rank of the first input's shape is used
        as an extra flag to distinguish kernels,
        e.g. bias_add_relu vs. add_relu_bias.
    """
    import ck_lib

    op_kind = ck_lib.library.OperationKind.Softmax
    extra_kind = len(func_attrs["inputs"][0]._attrs["shape"])
    extract_ops = list(Target.current()._operators[op_kind][extra_kind].items())
    softmax_ops = OrderedDict()
    for key, value in extract_ops:
        softmax_ops[key] = value[0]
    func_attrs["op_instance"] = softmax_ops
def emit_instance(op):
    """Emit instance."""
    import ck_lib  # noqa: F401

    op_def = op.emit()
    return op_def
def extract_config_name(config):
    """Extract configuration names.

    Parameters
    ----------
    config : str
        Configuration as a string in the format of 'using model = xxx'.

    Returns
    -------
    str
        Extracted name from the statement, e.g. 'model' for 'using model = xxx'.

    Raises
    ------
    RuntimeError
        Invalid config.
    """
    pattern = re.compile(r"\s*using\s(.*?)\s=")
    decl = config.split("\n")[1]
    match = pattern.match(decl)
    if match is None:
        raise RuntimeError("Invalid config:\n" + config)
    return match.groups()[0]
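One concrete example of the contract: configs emitted by emit_instance begin with a newline, so split("\n")[1] is the `using` declaration itself. With a made-up config string:

# Example: the name is pulled from the second line of the config string.
config = "\nusing DeviceInstance_f16 = ck::SomeDeviceOp<...>;\n"  # made-up config
assert extract_config_name(config) == "DeviceInstance_f16"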
def gen_profiler(
    func_attrs: Dict[str, Any],
    workdir: str,
    rank: int,
    shape_eval_template: jinja2.Template,
    exec_template: jinja2.Template,
    tensor_decl_template: jinja2.Template,
    extra_header_template: jinja2.Template,
    get_func_signature: Any,
    extra_code: str = "",
    func_call_template: jinja2.Template = FUNC_CALL_TEMPLATE,
    indent: str = "  ",
) -> list:
    """Generates standalone executables for the profiler.

    Parameters
    ----------
    func_attrs : Dict
        Operation attributes.
    workdir : str
        Directory to store the generated outputs.
    rank : int
        Rank of the input tensor. If using [M, N] in exec_key, the rank here
        must be 2 because it implies that the inputs are reshaped for
        profiling. For code gen, the real shapes are used.
    exec_template : jinja2.Template
        Execution block template.
    tensor_decl_template : jinja2.Template
        Tensor declaration template.
    extra_header_template : jinja2.Template
        Extra header template.
    indent : str, optional
        Indent for codegen, target dependent e.g. C++, python, etc., by default "  ".

    Returns
    -------
    list
        List of (src_path, obj_path) pairs for the profiler build.
    """
    op_type = func_attrs["op"]
    shape_eval = shape_eval_template.render(rank=rank) if shape_eval_template else ""
    eps = func_attrs.get("eps", "1e-5")
    op_instance = func_attrs["op_instance"]
    file_pairs = []
    for op_name, op in op_instance.items():
        config = emit_instance(op)
        config_name = extract_config_name(config)
        instances = INSTANCE_TEMPLATE.render(
            name="DeviceInstance", config_name=config_name, config=config
        )
        exe_path = exec_template.render(
            instance="DeviceInstance",
            dtype="void",
            reduce_dims=rank - 1,
            rank=rank,
            eps=eps,
        )
        op_func = FUNC_TEMPLATE.render(
            instances_decl=instances,
            func_signature=get_func_signature(func_attrs),
            shape_eval=shape_eval,
            exec_paths=exe_path,
            extra_headers=extra_header_template.render(),
            extra_code=extra_code,
        )
        structs_def = STRUCTS_DEF_TEMPLATE.render()
        args_parse = ARGS_PARSE_TEMPLATE.render(rank=rank)
        tensor_decl = tensor_decl_template.render(rank=rank)
        input_dim_names = [f"in_{i}" for i in range(rank)]
        func_call = func_call_template.render(
            func_name=func_attrs["name"],
            input="(void *) memory_pool->RequestHalfTensorByIdx(0)",
            gamma="(void *) memory_pool->RequestHalfTensorByIdx(2)",
            beta="(void *) memory_pool->RequestHalfTensorByIdx(3)",
            output="(void *) memory_pool->RequestHalfTensorByIdx(1)",
            input_dim_names=input_dim_names,
            indent=indent,
        )
        code = PROFILER_TEMPLATE.render(
            op_func=op_func,
            structs_def=structs_def,
            args_parse=args_parse,
            tensor_decl=tensor_decl,
            func_call=func_call,
        )
        prefix = os.path.join(workdir, "profiler", op_type)
        if not os.path.exists(prefix):
            os.makedirs(prefix)
        src_path = os.path.join(prefix, op_name + ".cpp")
        obj_path = os.path.join(prefix, op_name)
        if os.path.exists(obj_path):
            continue
        with open(src_path, "w") as fo:
            fo.write(code)
        file_pairs.append((src_path, obj_path))
    return file_pairs
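The generated profilers are standalone binaries: ARGS_PARSE_TEMPLATE reads one integer per input dimension from argv, and PROFILER_TEMPLATE prints WS:<bytes> and TIME:<ms> to stdout. A hypothetical driver for one compiled profiler (obj_path as returned in file_pairs, after it has been built) could look like this:

# Hypothetical sketch: run a compiled profiler binary and parse its output.
import subprocess

def run_profiler(obj_path: str, dims) -> float:
    """Run a profiler with one argv entry per input dim and return the
    reported time in ms (the binary prints WS:<bytes> and TIME:<ms>,
    per PROFILER_TEMPLATE above)."""
    out = subprocess.check_output([obj_path] + [str(d) for d in dims], text=True)
    for line in out.splitlines():
        if line.startswith("TIME:"):
            return float(line[len("TIME:"):])
    raise RuntimeError("profiler produced no TIME line:\n" + out)

# e.g. run_profiler("workdir/profiler/layernorm/op_name", [1024, 768])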
# no longer used by layernorm
def gen_function(
    func_attrs: Dict[str, Any],
    exec_template: jinja2.Template,
    extra_header_template: jinja2.Template,
    get_func_signature: Any,
) -> str:
    """Generate function body.

    Parameters
    ----------
    func_attrs : Dict
        Operation attributes.
    exec_template : jinja2.Template
        Execution block template.
    extra_header_template : jinja2.Template
        Extra header template.

    Returns
    -------
    str
        The rendered template of generated function body.
    """
    shapes = func_attrs["inputs"][0]._attrs["shape"]
    rank = len(shapes)
    exec_path = func_attrs["exec_path"]
    op_instance = func_attrs["op_instance"]
    inst_def_flag = set()
    instances = {}
    instance_decl = ""
    for exec_item in exec_path.values():
        fname = "f" + sha1(exec_item.exec_cond.encode()).hexdigest()
        algo = exec_item.algo
        if algo not in inst_def_flag:
            config = emit_instance(op_instance[algo])
            inst_def_flag.add(algo)
        else:
            config = ""
        inst = INSTANCE_TEMPLATE.render(
            config=config, name=fname, config_name=extract_config_name(config)
        )
        instances[exec_item.exec_cond] = inst
        instance_decl += inst
    exec_cond_template = func_attrs["exec_cond_template"]
    exec_paths = ""
    for key, _ in instances.items():
        fname = "f" + sha1(key.encode()).hexdigest()
        program = exec_template.render(
            instance=fname, dtype="void", reduce_dims=rank - 1, rank=rank
        )
        cond_vars = re.findall(r"\S+(?= >=)", key)
        cond_vars += re.findall(r"\S+(?= ==)", key)
        cond = key
        for i, var in enumerate(cond_vars):
            cond = cond.replace(var + " ", "*in_" + str(i))
        exec_inst = exec_cond_template.render(indent="  ", cond=cond, program=program)
        exec_paths += exec_inst
    return FUNC_TEMPLATE.render(
        instances_decl=instance_decl,
        func_signature=get_func_signature(func_attrs),
        exec_paths=exec_paths,
        extra_headers=extra_header_template.render(),
    )
def gen_function_call(func_attrs, indent="  "):
    """Generates function call.

    Parameters
    ----------
    func_attrs : Dict
        Stores the operation attributes.
    indent : str, optional
        Indent for codegen, target dependent e.g. C++, python, etc., by default "  ".

    Returns
    -------
    str
        The rendered template of generated function call.
    """
    input_name = FUNC_CALL_PARAM_TEMPLATE.render(
        name=func_attrs["inputs"][0]._attrs["name"]
    )
    output_name = FUNC_CALL_PARAM_TEMPLATE.render(
        name=func_attrs["outputs"][0]._attrs["name"]
    )
    shapes = func_attrs["inputs"][0]._attrs["shape"]
    input_dim_names = [shape._attrs["name"] for shape in shapes]
    return FUNC_CALL_TEMPLATE.render(
        func_name=func_attrs["name"],
        input=input_name,
        output=output_name,
        input_dim_names=input_dim_names,
        indent=indent,
    )
python/AIT implementation/sample files/permute_common.py (new file, mode 100644)
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import jinja2

EXTRA_SHAPE_TEMPLATE = jinja2.Template(
    """
{{indent}}const int64_t stride_a = *a_dim1;
{{indent}}const int64_t stride_b = *b_dim1;
{{indent}}const int64_t stride_c = *c_dim1;
ck::index_t M0 = M / G1 / G2;
ck::index_t M1 = G1;
ck::index_t M2 = G2;
ck::index_t N0 = G3;
ck::index_t N1 = N / G3;
// GEMM shape
//ck::index_t M = M0 * M1 * M2;
//ck::index_t N = N0 * N1;
//ck::index_t K = 128;
//ck::index_t stride_A = K;
//ck::index_t stride_B = K;
// E = [M0, N0, M1, N1, M2]
/* 0, 3, 1, 4, 2
ck::index_t stride_E_M0 = N0 * M1 * N1 * M2;
ck::index_t stride_E_M1 = N1 * M2;
ck::index_t stride_E_M2 = 1;
ck::index_t stride_E_N0 = M1 * N1 * M2;
ck::index_t stride_E_N1 = M2;
*/
// E = [M2, M0, N0, M1, N1] 2, 0, 3, 1, 4
ck::index_t stride_E_M0 = N0* M1* N1;
ck::index_t stride_E_M1 = N1;
ck::index_t stride_E_M2 = M0* N0* M1* N1;
ck::index_t stride_E_N0 = M1 * N1;
ck::index_t stride_E_N1 = 1;
// D = [0, N0, 0, N1, 0]
ck::index_t stride_D_M0 = 0;
ck::index_t stride_D_M1 = 0;
ck::index_t stride_D_M2 = 0;
ck::index_t stride_D_N0 = N1;
ck::index_t stride_D_N1 = 1;
"""
)
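The stride arithmetic above is a row-major layout of the permuted output E = [M2, M0, N0, M1, N1], i.e. permutation (2, 0, 3, 1, 4) of [M0, N0, M1, N1, M2]. A small self-check with illustrative sizes confirms those strides enumerate every element exactly once:

# Sanity check (illustrative sizes): the E strides above are the row-major
# strides of [M2, M0, N0, M1, N1], read back in [M0, N0, M1, N1, M2] order.
M0, M1, M2, N0, N1 = 2, 3, 4, 5, 6
stride = {
    "M0": N0 * M1 * N1,
    "M1": N1,
    "M2": M0 * N0 * M1 * N1,
    "N0": M1 * N1,
    "N1": 1,
}
offsets = {
    m0 * stride["M0"] + m1 * stride["M1"] + m2 * stride["M2"]
    + n0 * stride["N0"] + n1 * stride["N1"]
    for m0 in range(M0) for m1 in range(M1) for m2 in range(M2)
    for n0 in range(N0) for n1 in range(N1)
}
assert offsets == set(range(M0 * M1 * M2 * N0 * N1))  # bijective layout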
EXTRA_SHAPE_TEMPLATE_M2N3 = jinja2.Template(
    """
const int64_t G1 = p_dim0; // G1
const int64_t G2 = p_dim1; // G2
const int64_t G3 = p_dim2; // G3
ck::index_t M0 = M / G1;
ck::index_t M1 = G1;
ck::index_t N0 = G2;
ck::index_t N1 = G3;
ck::index_t N2 = N / G2 / G3;
ck::index_t K0 = K;
ck::index_t G = 1;
// A[G, M0, M1, M2, K0]
std::vector<ck::index_t> a_ms_ks_lengths{G, M0, M1, K0};
std::vector<ck::index_t> a_ms_ks_strides{M0*M1*K0, M1 * K0, K0, 1};
// B[G, N0, N1, K0]
std::vector<ck::index_t> b_ns_ks_lengths{G, N0, N1, N2, K0};
std::vector<ck::index_t> b_ns_ks_strides{N0*N1*N2*K0, N1 * N2 * K0, N2 * K0, K0, 1};
// D[G, N0, M0, N1, M1, N2]
std::vector<ck::index_t> d_ms_ns_lengths{G, M0, M1, N0, N1, N2};
std::vector<ck::index_t> d_ms_ns_strides{N0 * N1 * N2, 0, 0, N1 * N2, N2, 1};
// E[G, N0, M0, N1, M1, N2] 2, 0, 3, 1, 4
std::vector<ck::index_t> e_ms_ns_lengths{G, M0, M1, N0, N1, N2};
std::vector<ck::index_t> e_ms_ns_strides{M0* M1* N0* N1* N2,
N1 * M1 * N2,
N2,
M0 * N1 * M1 * N2,
M1 * N2,
1};
"""
)
EXTRA_SHAPE_TEMPLATE_M3N2 = jinja2.Template(
    """
const int64_t G1 = p_dim0; // G1
const int64_t G2 = p_dim1; // G2
const int64_t G3 = p_dim2; // G3
ck::index_t M0 = M / G1 / G2;
ck::index_t M1 = G1;
ck::index_t M2 = G2;
ck::index_t N0 = G3;
ck::index_t N1 = N / G3;
ck::index_t K0 = K;
ck::index_t G = 1;
// A[M0, M1, M2, K0]
std::vector<ck::index_t> a_ms_ks_lengths{G, M0, M1, M2, K0};
std::vector<ck::index_t> a_ms_ks_strides{M0 * M1 * M2 * K0, M1 * M2 * K0, M2 * K0, K0, 1};
// B[N0, N1, K0]
std::vector<ck::index_t> b_ns_ks_lengths{G, N0, N1, K0};
std::vector<ck::index_t> b_ns_ks_strides{N0 * N1 * K0, N1 * K0, K0, 1};
// D[M0, N0, M1, N1, M2]
std::vector<ck::index_t> d_ms_ns_lengths{G, M0, M1, M2, N0, N1};
std::vector<ck::index_t> d_ms_ns_strides{N0*N1, 0, 0, 0, N1, 1};
// E[M0, N0, M1, N1, M2]
std::vector<ck::index_t> e_ms_ns_lengths{G, M0, M1, M2, N0, N1};
std::vector<ck::index_t> e_ms_ns_strides{M0 * M1* M2 * N1* N0, N0* M1* N1, N1, M0* N0* M1* N1, M1 * N1, 1};
"""
)