Commit 516bbdcb authored by Astha Rai

initial push with templated dev op

parent fde6d274
#include "model_interface.h"
#include <iostream>
#include <unordered_map>
#include "model-generated.h"
#include "model_container.h"
// Important: don't let exceptions escape the functions below.
// They can cause problems when -fvisibility=hidden. But more
// importantly, they can crash the program if they try to cross
// the language boundary into Python.
#define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \
try { \
__VA_ARGS__ \
} catch (const std::exception& e) { \
LOG(ERROR) << "Error: " << e.what(); \
return AITemplateError::AITemplateFailure; \
} catch (...) { \
LOG(ERROR) << "Unknown exception occurred."; \
return AITemplateError::AITemplateFailure; \
} \
return AITemplateError::AITemplateSuccess;
#define RETURN_ERROR_IF_NULL(var) \
if (var == nullptr) { \
LOG(ERROR) << "Variable " << #var << " can't be null"; \
return AITemplateError::AITemplateFailure; \
}
namespace ait {
namespace {
class DefaultAllocator : public AITemplateAllocator {
public:
void* Allocate(size_t n_bytes) override {
void* result;
DEVICE_CHECK(DeviceMalloc(&result, n_bytes));
return result;
}
void Free(void* ptr) override {
DEVICE_CHECK(FreeDeviceMemory(ptr));
}
};
class TrackingAllocator : public DefaultAllocator {
public:
void* Allocate(size_t n_bytes) override {
auto* result = DefaultAllocator::Allocate(n_bytes);
num_bytes_ += n_bytes;
return result;
}
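// Returns the cumulative number of bytes handed out by Allocate(); Free()
// does not decrement this count.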
size_t NumBytesAllocated() const {
return num_bytes_;
}
private:
size_t num_bytes_ = 0;
};
DefaultAllocator default_allocator;
} // namespace
} // namespace ait
extern "C" {
AITemplateError AITemplateModelContainerCreate(
AITemplateModelHandle* ret,
size_t num_runtimes,
AITemplateAllocator* allocator) {
if (num_runtimes == 0) {
LOG(ERROR) << "num_runtimes must be positive, but got 0";
return AITemplateError::AITemplateFailure;
}
RETURN_ERROR_IF_NULL(ret)
AITemplateAllocator& allocator_ref =
allocator == nullptr ? ait::default_allocator : *allocator;
CONVERT_EXCEPTION_TO_ERROR_CODE({
auto* m = ait::CreateModelContainer(num_runtimes, allocator_ref);
*ret = reinterpret_cast<AITemplateModelHandle>(m);
})
}
AITemplateError AITemplateModelContainerDelete(AITemplateModelHandle handle) {
RETURN_ERROR_IF_NULL(handle)
CONVERT_EXCEPTION_TO_ERROR_CODE({
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
delete m;
});
}
AITemplateError AITemplateModelContainerSetConstant(
AITemplateModelHandle handle,
const char* name,
const AITData* tensor) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(tensor)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({ m->SetConstant(name, *tensor); })
}
AITemplateError AITemplateModelContainerRun(
AITemplateModelHandle handle,
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
AITemplateStreamHandle stream_handle,
bool sync,
bool graph_mode,
int64_t** output_shapes_out) {
RETURN_ERROR_IF_NULL(handle)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({
m->Run(
inputs,
num_inputs,
outputs,
num_outputs,
stream,
sync,
graph_mode,
output_shapes_out);
})
}
AITemplateError AITemplateModelContainerRunWithOutputsOnHost(
AITemplateModelHandle handle,
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
AITemplateStreamHandle stream_handle,
bool graph_mode,
int64_t** output_shapes_out) {
RETURN_ERROR_IF_NULL(handle)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({
m->RunWithOutputsOnHost(
inputs,
num_inputs,
outputs,
num_outputs,
stream,
graph_mode,
output_shapes_out);
})
}
AITemplateError AITemplateModelContainerBenchmark(
AITemplateModelHandle handle,
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
AITemplateStreamHandle stream_handle,
bool graph_mode,
size_t count,
size_t num_threads,
bool use_unique_stream_per_thread,
float* runtime_ms,
int64_t** output_shapes_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(runtime_ms)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({
*runtime_ms = m->Benchmark(
inputs,
num_inputs,
outputs,
num_outputs,
stream,
graph_mode,
count,
num_threads,
use_unique_stream_per_thread,
output_shapes_out);
})
}
AITemplateError AITemplateModelContainerGetNumInputs(
AITemplateModelHandle handle,
size_t* num_inputs_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(num_inputs_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({ *num_inputs_out = m->NumInputs(); })
}
AITemplateError AITemplateModelContainerGetInputName(
AITemplateModelHandle handle,
size_t input_idx,
const char** input_name_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(input_name_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE(
{ *input_name_out = m->InputName(input_idx); })
}
AITemplateError AITemplateModelContainerGetNumOutputs(
AITemplateModelHandle handle,
size_t* num_outputs_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(num_outputs_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({ *num_outputs_out = m->NumOutputs(); })
}
AITemplateError AITemplateModelContainerGetOutputName(
AITemplateModelHandle handle,
size_t output_idx,
const char** output_name_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(output_name_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE(
{ *output_name_out = m->OutputName(output_idx); })
}
AITemplateError AITemplateModelContainerGetMaximumOutputShape(
AITemplateModelHandle handle,
size_t output_idx,
AITemplateParamShape* shape_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(shape_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE(
{ *shape_out = m->MaxOutputShape(output_idx); })
}
AITemplateError AITemplateModelContainerGetOutputDtype(
AITemplateModelHandle handle,
size_t output_idx,
AITemplateDtype* dtype_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(dtype_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({ *dtype_out = m->OutputDtype(output_idx); })
}
AITemplateError AITemplateModelContainerGetNumRuntimes(
AITemplateModelHandle handle,
size_t* num_runtimes_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(num_runtimes_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({ *num_runtimes_out = m->GetNumRuntimes(); })
}
AITemplateError AITemplateAllocatorCreate(
AITemplateAllocator** allocator_out,
AITemplateAllocatorType allocator_type) {
RETURN_ERROR_IF_NULL(allocator_out);
CONVERT_EXCEPTION_TO_ERROR_CODE({
switch (allocator_type) {
case AITemplateAllocatorType::kDefault:
*allocator_out = new ait::DefaultAllocator();
break;
case AITemplateAllocatorType::kTracking:
*allocator_out = new ait::TrackingAllocator();
break;
default:
throw std::runtime_error("Unrecognized allocator type");
}
});
}
AITemplateError AITemplateAllocatorDelete(AITemplateAllocator* allocator) {
RETURN_ERROR_IF_NULL(allocator);
delete allocator;
return AITemplateError::AITemplateSuccess;
}
AITemplateError AITemplateTrackingAllocatorGetNumBytes(
AITemplateAllocator* allocator,
size_t* num_bytes_out) {
RETURN_ERROR_IF_NULL(allocator);
RETURN_ERROR_IF_NULL(num_bytes_out);
CONVERT_EXCEPTION_TO_ERROR_CODE({
auto* tracking_allocator = dynamic_cast<ait::TrackingAllocator*>(allocator);
if (tracking_allocator == nullptr) {
throw std::runtime_error("Allocator was not a tracking allocator!");
}
*num_bytes_out = tracking_allocator->NumBytesAllocated();
});
}
} // extern "C"
#pragma once
#include <stddef.h>
#include <stdint.h>
#include <numeric>
#include <stdexcept>
#include <utility>
#include <vector>
// We compile all models with -fvisibility=hidden. Any symbols that need to be
// exposed in the final shared library must be declared with AIT_EXPORT to make
// them visible.
#ifdef __GNUC__ // Applies to any compiler with GNU extensions (clang and g++)
#define AIT_EXPORT __attribute__((__visibility__("default")))
#else
#ifdef _WIN32
#define AIT_EXPORT __declspec(dllexport)
#else
#define AIT_EXPORT
#endif
#endif
struct AITemplateModelOpaque {};
using AITemplateModelHandle = AITemplateModelOpaque*;
enum class AITemplateError : int {
AITemplateSuccess = 0,
AITemplateFailure = 1,
};
struct AITemplateParamShape {
AITemplateParamShape() : shape_data(nullptr), size(0) {}
AITemplateParamShape(const int64_t* shape_data_in, size_t size_in)
: shape_data(shape_data_in), size(size_in) {}
const int64_t* shape_data;
size_t size;
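// Total number of elements implied by the shape, e.g. {2, 3, 4} -> 24.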
size_t Numel() const {
return std::accumulate(
shape_data, shape_data + size, 1, std::multiplies<int64_t>());
}
};
enum class AITemplateDtype {
kUnset = 0,
kHalf,
kFloat,
kInt,
kLong,
kBool,
};
struct AITData {
AITData() : ptr(nullptr), dtype(AITemplateDtype::kUnset) {}
AITData(
void* ptr_in,
const AITemplateParamShape& shape_in,
AITemplateDtype dtype_in)
: ptr(ptr_in), shape(shape_in), dtype(dtype_in) {}
void* ptr;
AITemplateParamShape shape;
AITemplateDtype dtype;
};
inline size_t AITemplateDtypeSizeBytes(AITemplateDtype dtype) {
switch (dtype) {
case AITemplateDtype::kHalf:
return 2;
case AITemplateDtype::kFloat:
return 4;
case AITemplateDtype::kInt:
return 4;
case AITemplateDtype::kLong:
return 8;
case AITemplateDtype::kBool:
return 1;
case AITemplateDtype::kUnset:
throw std::runtime_error("Unset dtype has no size!");
}
// Unreachable for valid enum values; guards against out-of-range inputs and
// silences missing-return warnings.
throw std::runtime_error("Unrecognized dtype");
}
struct AITemplateStreamOpaque {};
using AITemplateStreamHandle = AITemplateStreamOpaque*;
// Allocator to use for GPU mallocs and frees. Allocations will only happen
// when the ModelContainer is created.
class AITemplateAllocator {
public:
virtual void* Allocate(size_t nbytes) = 0;
virtual void Free(void* ptr) = 0;
virtual ~AITemplateAllocator() = default;
};
// Some custom allocators are provided. They can be created by passing
// an enum into the AITemplateAllocatorCreate() function.
enum class AITemplateAllocatorType {
// The default allocator just uses the backend's default malloc/free.
kDefault = 0,
// The tracking allocator is like the default allocator, but it keeps
// track of how many bytes it has allocated. Mainly used for testing.
kTracking,
};
extern "C" {
// Create a ModelContainer. See model_container.h for all the details.
// Some important high-level notes:
// * If allocator is null, a default allocator is used (forwards to
// {cuda/hip}{Malloc/Free}).
// * We assume that the allocator lives at least as long as the ModelContainer.
AIT_EXPORT AITemplateError AITemplateModelContainerCreate(
AITemplateModelHandle* ret,
size_t num_runtimes,
AITemplateAllocator* allocator = nullptr);
AIT_EXPORT AITemplateError
AITemplateModelContainerDelete(AITemplateModelHandle handle);
AIT_EXPORT AITemplateError AITemplateModelContainerSetConstant(
AITemplateModelHandle handle,
const char* name,
const AITData* tensor);
AIT_EXPORT AITemplateError AITemplateModelContainerRun(
AITemplateModelHandle handle,
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
AITemplateStreamHandle stream_handle,
bool sync,
bool graph_mode,
int64_t** output_shapes_out);
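//
// Illustrative usage sketch (a minimal, hypothetical example, not part of the
// API surface): inputs, outputs, and output_shapes_out are caller-prepared
// arrays of AITData / shape pointers, and error codes are left unchecked for
// brevity.
//
//   AITemplateModelHandle handle;
//   AITemplateModelContainerCreate(&handle, /*num_runtimes=*/1);
//   AITemplateModelContainerRun(
//       handle, inputs, num_inputs, outputs, num_outputs,
//       /*stream_handle=*/nullptr, /*sync=*/true, /*graph_mode=*/false,
//       output_shapes_out);
//   AITemplateModelContainerDelete(handle);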
// Like AITemplateModelContainerRun, but expects outputs to be allocated on the
// host. Does an extra sync/copy at the end to copy them over. Warning: don't
// use this! It's not optimal with respect to performance. It's here for use by
// internal constant folding passes.
AIT_EXPORT AITemplateError AITemplateModelContainerRunWithOutputsOnHost(
AITemplateModelHandle handle,
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
AITemplateStreamHandle stream_handle,
bool graph_mode,
int64_t** output_shapes_out);
AIT_EXPORT AITemplateError AITemplateModelContainerBenchmark(
AITemplateModelHandle handle,
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
AITemplateStreamHandle stream_handle,
bool graph_mode,
size_t count,
size_t num_threads,
bool use_unique_stream_per_thread,
float* runtime_ms,
int64_t** output_shapes_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetNumInputs(
AITemplateModelHandle handle,
size_t* num_inputs_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetInputName(
AITemplateModelHandle handle,
size_t input_idx,
const char** input_name_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetNumOutputs(
AITemplateModelHandle handle,
size_t* num_outputs_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetOutputName(
AITemplateModelHandle handle,
size_t output_idx,
const char** output_name_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetMaximumOutputShape(
AITemplateModelHandle handle,
size_t output_idx,
AITemplateParamShape* shape_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetOutputDtype(
AITemplateModelHandle handle,
size_t output_idx,
AITemplateDtype* dtype_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetNumRuntimes(
AITemplateModelHandle handle,
size_t* num_runtimes_out);
AIT_EXPORT AITemplateError AITemplateAllocatorCreate(
AITemplateAllocator** allocator_out,
AITemplateAllocatorType allocator_type);
AIT_EXPORT AITemplateError
AITemplateAllocatorDelete(AITemplateAllocator* allocator);
// Get the number of bytes allocated; mainly used for testing.
AIT_EXPORT AITemplateError AITemplateTrackingAllocatorGetNumBytes(
AITemplateAllocator* allocator,
size_t* num_bytes_out);
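//
// Illustrative sketch of the allocator API (hypothetical variable names,
// error codes unchecked): a tracking allocator can be used to measure how
// much device memory a container allocates at creation time.
//
//   AITemplateAllocator* tracking = nullptr;
//   AITemplateAllocatorCreate(&tracking, AITemplateAllocatorType::kTracking);
//   AITemplateModelHandle handle;
//   AITemplateModelContainerCreate(&handle, /*num_runtimes=*/1, tracking);
//   size_t num_bytes = 0;
//   AITemplateTrackingAllocatorGetNumBytes(tracking, &num_bytes);
//   AITemplateModelContainerDelete(handle);
//   AITemplateAllocatorDelete(tracking);
//
// The allocator must outlive the container (see the note on
// AITemplateModelContainerCreate above).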
} // extern "C"
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Normalization common codegen for ROCM.
"""
import os
import re
from hashlib import sha1
from typing import Any, Dict, List, OrderedDict, Tuple
import jinja2
from ...target import Target
FUNC_CALL_PARAM_TEMPLATE = jinja2.Template("(void *)({{name}})")
INSTANCE_TEMPLATE = jinja2.Template(
"""
{{config}}
using {{name}} = {{ config_name }};
"""
)
ARGS_PARSE_TEMPLATE = jinja2.Template(
"""
{% for idx in range(rank) %}
const int64_t in_{{idx}} = std::stoi(argv[{{ idx + 1 }}]);
{% endfor %}
"""
)
STRUCTS_DEF_TEMPLATE = jinja2.Template(
"""
struct ProfilerMemoryPool {
ProfilerMemoryPool() {
std::random_device rd;
gen = std::mt19937(rd());
uniform_dist = std::uniform_int_distribution<int64_t>(1, 48964896);
offsets.reserve(512);
strides.reserve(512);
copies.reserve(512);
ptrs.reserve(512);
// Create the rocRAND generator up front so the seed/generate calls in
// AllocateGaussianTensor operate on a valid handle.
rocrand_create_generator(&generator, ROCRAND_RNG_PSEUDO_DEFAULT);
}
~ProfilerMemoryPool() {
for(int i = 0; i < ptrs.size(); i++){
hipFree(ptrs[i]);
}
rocrand_destroy_generator(generator);
}
template <typename DType>
DType* AllocateGaussianTensor(int64_t size) {
size_t length = size * sizeof(DType);
DType *d_x;
hipMalloc(&d_x, length);
float mean = 0.0f;
float stddev = 1.0f;
uint64_t seed = uniform_dist(gen);
rocrand_set_seed(generator, seed);
rocrand_generate_normal(generator, reinterpret_cast<float*>(d_x), size, mean, stddev);
return d_x;
}
ck::half_t* AllocateHalfGaussianTensor(int64_t size) {
return reinterpret_cast<ck::half_t*>(
AllocateGaussianTensor<ck::half_t>(size));
}
int AllocateHalfTensor(int64_t size, int64_t copy) {
offsets.push_back(0);
strides.push_back(size);
copies.push_back(copy);
auto ptr = AllocateHalfGaussianTensor(size * copy);
ptrs.push_back(reinterpret_cast<void*>(ptr));
return ptrs.size() - 1;
}
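// Returns the next copy of tensor idx in round-robin order: the per-tensor
// offset advances by the tensor's stride on every request and wraps back to
// zero once all copies have been handed out, so repeated profiler calls cycle
// through distinct buffers.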
ck::half_t* RequestHalfTensorByIdx(int idx) {
auto copy = copies.at(idx);
auto offset = offsets.at(idx);
auto stride = strides.at(idx);
ck::half_t* ptr = reinterpret_cast<ck::half_t*>(ptrs.at(idx));
ptr += offset;
offset += stride;
if (offset == copy * stride) {
offset = 0;
}
offsets[idx] = offset;
return ptr;
}
std::vector<int64_t> offsets;
std::vector<int64_t> strides;
std::vector<int64_t> copies;
std::vector<void*> ptrs;
std::mt19937 gen;
std::uniform_int_distribution<int64_t> uniform_dist;
rocrand_generator generator;
};
// hack for DeviceMem linking error
// TODO fix this by making CK a header-only lib
// <<< hack begin
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p) const
{
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p) const
{
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
struct KernelTimerImpl
{
KernelTimerImpl() {
hipGetErrorString(hipEventCreate(&mStart));
hipGetErrorString(hipEventCreate(&mEnd));
}
~KernelTimerImpl() {
hipGetErrorString(hipEventDestroy(mStart));
hipGetErrorString(hipEventDestroy(mEnd));
}
void Start() {
hipGetErrorString(hipDeviceSynchronize());
hipGetErrorString(hipEventRecord(mStart, nullptr));
}
void End() {
hipGetErrorString(hipEventRecord(mEnd, nullptr));
hipGetErrorString(hipEventSynchronize(mEnd));
}
float GetElapsedTime() const {
float time;
hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
return time;
}
hipEvent_t mStart, mEnd;
};
// >>> hack end
"""
)
PROFILER_TEMPLATE = jinja2.Template(
"""
size_t GLOBAL_WORKSPACE_SIZE = 0;
{{op_func}}
{{structs_def}}
int main(int argc, char** argv) {
{{args_parse}}
auto memory_pool = std::make_unique<ProfilerMemoryPool>();
hipStream_t stream = nullptr;
{{tensor_decl}}
// warmup
for(int i = 0; i < 3; ++i) {
{{func_call}}
}
// run
KernelTimerImpl timer;
timer.Start();
for(int i = 0; i < 5; ++i) {
{{func_call}}
}
timer.End();
std::cout << "WS:" <<GLOBAL_WORKSPACE_SIZE<<std::endl;
std::cout << "TIME:" << timer.GetElapsedTime() << std::endl;
}
"""
)
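# The rendered profiler binary takes the input dims as argv[1..rank] (see
# ARGS_PARSE_TEMPLATE above) and reports its results on stdout as "WS:<bytes>"
# and "TIME:<ms>" lines.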
FUNC_TEMPLATE = jinja2.Template(
"""
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <random>
#include <rocrand/rocrand.h>
#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/reduction_operator.hpp"
{{extra_headers}}
{{extra_code}}
{{instances_decl}}
{{func_signature}}
{
{{shape_eval}}
{{exec_paths}}
}
"""
)
FUNC_CALL_TEMPLATE = jinja2.Template(
"""
{{indent}}{{func_name}}(
{{indent}} {{input}},
{{indent}} {{output}},
{% for name in input_dim_names %}
{{indent}} const_cast<int64_t *>(&{{name}}),
{% endfor %}
{{indent}} stream
{{indent}});
"""
)
def extract_config(func_attrs):
"""Extract (operation name, operation instance) pair
from all operation candidates.
Parameters
----------
op_kind : ck_lib.library.OperationKind
Operation kind.
extra_kind : ck_lib.library.[AnyKind]
Used to as extra flag to distinguish kernels.
E.g. bias_add_relu vs. add_relu_bias
f_prop_op: function
Used to filter operation.
Returns
-------
Dict
Extracted (operation name, operation instance) pair.
"""
import ck_lib
op_kind = ck_lib.library.OperationKind.Softmax
extra_kind = len(func_attrs["inputs"][0]._attrs["shape"])
extract_ops = list(Target.current()._operators[op_kind][extra_kind].items())
softmax_ops = OrderedDict()
for key, value in extract_ops:
softmax_ops[key] = value[0]
func_attrs["op_instance"] = softmax_ops
def emit_instance(op):
"""Emit instance"""
import ck_lib # noqa: F401
op_def = op.emit()
return op_def
def extract_config_name(config):
"""Extract configuration names.
Parameters
----------
config : str
Configuration as a string in the format of 'using model = xxx'.
Returns
-------
str
Extracted name from the statement, e.g. 'model' for 'using model = xxx'.
Raises
------
RuntimeError
Invalid config.
"""
pattern = re.compile(r"\s*using\s(.*?)\s=")
decl = config.split("\n")[1]
match = pattern.match(decl)
if match is None:
raise RuntimeError("Invalid config: \n" + config)
return match.groups()[0]
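# Illustrative example (hypothetical instance name): for a config whose second
# line is a 'using' declaration, e.g.
#   config = "\nusing DeviceInstance_fp16 = ck::DeviceSoftmax<...>;"
# extract_config_name(config) returns "DeviceInstance_fp16".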
def gen_profiler(
func_attrs: Dict[str, Any],
workdir: str,
rank: int,
shape_eval_template: jinja2.Template,
exec_template: jinja2.Template,
tensor_decl_template: jinja2.Template,
extra_header_template: jinja2.Template,
get_func_signature: Any,
extra_code: str = "",
func_call_template: jinja2.Template = FUNC_CALL_TEMPLATE,
indent: str = " ",
) -> List[Tuple[str, str]]:
"""Generates standalone executables for the profiler.
Parameters
----------
func_attrs : Dict
Operation attributes.
workdir : str
Directory to store the generated outputs.
rank: int
Rank of the input tensor. If using [M, N] in exec_key, the rank here
must be 2 because it implies that the inputs are reshaped for profiling.
For code gen, the real shapes are used.
exec_template : jinja2.Template
Execution block template.
tensor_decl_template: jinja2.Template
Tensor declaration template.
extra_header_template : jinja2.Template
Extra header template.
indent : str, optional
Indent for codegen, target dependent e.g. C++, python, etc., by default " ".
"""
op_type = func_attrs["op"]
shape_eval = shape_eval_template.render(rank=rank) if shape_eval_template else ""
eps = func_attrs.get("eps", "1e-5")
op_instance = func_attrs["op_instance"]
file_pairs = []
for op_name, op in op_instance.items():
config = emit_instance(op)
config_name = extract_config_name(config)
instances = INSTANCE_TEMPLATE.render(
name="DeviceInstance", config_name=config_name, config=config
)
exe_path = exec_template.render(
instance="DeviceInstance",
dtype="void",
reduce_dims=rank - 1,
rank=rank,
eps=eps,
)
op_func = FUNC_TEMPLATE.render(
instances_decl=instances,
func_signature=get_func_signature(func_attrs),
shape_eval=shape_eval,
exec_paths=exe_path,
extra_headers=extra_header_template.render(),
extra_code=extra_code,
)
structs_def = STRUCTS_DEF_TEMPLATE.render()
args_parse = ARGS_PARSE_TEMPLATE.render(rank=rank)
tensor_decl = tensor_decl_template.render(rank=rank)
input_dim_names = [f"in_{i}" for i in range(rank)]
func_call = func_call_template.render(
func_name=func_attrs["name"],
input="(void *) memory_pool->RequestHalfTensorByIdx(0)",
gamma="(void *) memory_pool->RequestHalfTensorByIdx(2)",
beta="(void *) memory_pool->RequestHalfTensorByIdx(3)",
output="(void *) memory_pool->RequestHalfTensorByIdx(1)",
input_dim_names=input_dim_names,
indent=indent,
)
code = PROFILER_TEMPLATE.render(
op_func=op_func,
structs_def=structs_def,
args_parse=args_parse,
tensor_decl=tensor_decl,
func_call=func_call,
)
prefix = os.path.join(workdir, "profiler", op_type)
if not os.path.exists(prefix):
os.makedirs(prefix)
src_path = os.path.join(prefix, op_name + ".cpp")
obj_path = os.path.join(prefix, op_name)
if os.path.exists(obj_path):
continue
with open(src_path, "w") as fo:
fo.write(code)
file_pairs.append((src_path, obj_path))
return file_pairs
# no longer used by layernorm
def gen_function(
func_attrs: Dict[str, Any],
exec_template: jinja2.Template,
extra_header_template: jinja2.Template,
get_func_signature: Any,
) -> str:
"""Generate function body.
Parameters
----------
func_attrs : Dict
Operation attributes.
exec_template : jinja2.Template
Execution block template.
extra_header_template : jinja2.Template
Extra header template.
Returns
-------
str
The rendered template of generated function body.
"""
shapes = func_attrs["inputs"][0]._attrs["shape"]
rank = len(shapes)
exec_path = func_attrs["exec_path"]
op_instance = func_attrs["op_instance"]
inst_def_flag = set()
instances = {}
instance_decl = ""
for exec_item in exec_path.values():
fname = "f" + sha1(exec_item.exec_cond.encode()).hexdigest()
algo = exec_item.algo
if algo not in inst_def_flag:
config = emit_instance(op_instance[algo])
inst_def_flag.add(algo)
else:
config = ""
inst = INSTANCE_TEMPLATE.render(
config=config, name=fname, config_name=extract_config_name(config)
)
instances[exec_item.exec_cond] = inst
instance_decl += inst
exec_cond_template = func_attrs["exec_cond_template"]
exec_paths = ""
for key, _ in instances.items():
fname = "f" + sha1(key.encode()).hexdigest()
program = exec_template.render(
instance=fname, dtype="void", reduce_dims=rank - 1, rank=rank
)
cond_vars = re.findall(r"\S+(?= >=)", key)
cond_vars += re.findall(r"\S+(?= ==)", key)
cond = key
for i, var in enumerate(cond_vars):
cond = cond.replace(var + " ", "*in_" + str(i))
exec_inst = exec_cond_template.render(indent=" ", cond=cond, program=program)
exec_paths += exec_inst
return FUNC_TEMPLATE.render(
instances_decl=instance_decl,
func_signature=get_func_signature(func_attrs),
exec_paths=exec_paths,
extra_headers=extra_header_template.render(),
)
def gen_function_call(func_attrs, indent=" "):
"""Generates function call.
Parameters
----------
func_attrs : Dict
Stores the operation attributes.
indent : str, optional
Indent for codegen, target dependent e.g. C++, python, etc., by default " ".
Returns
-------
str
The rendered template of generated function call.
"""
input_name = FUNC_CALL_PARAM_TEMPLATE.render(
name=func_attrs["inputs"][0]._attrs["name"]
)
output_name = FUNC_CALL_PARAM_TEMPLATE.render(
name=func_attrs["outputs"][0]._attrs["name"]
)
shapes = func_attrs["inputs"][0]._attrs["shape"]
input_dim_names = [shape._attrs["name"] for shape in shapes]
return FUNC_CALL_TEMPLATE.render(
func_name=func_attrs["name"],
input=input_name,
output=output_name,
input_dim_names=input_dim_names,
indent=indent,
)
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import jinja2
EXTRA_SHAPE_TEMPLATE = jinja2.Template(
"""
{{indent}}const int64_t stride_a = *a_dim1;
{{indent}}const int64_t stride_b = *b_dim1;
{{indent}}const int64_t stride_c = *c_dim1;
ck::index_t M0 = M / G1 / G2;
ck::index_t M1 = G1;
ck::index_t M2 = G2;
ck::index_t N0 = G3;
ck::index_t N1 = N / G3;
// GEMM shape
//ck::index_t M = M0 * M1 * M2;
//ck::index_t N = N0 * N1;
//ck::index_t K = 128;
//ck::index_t stride_A = K;
//ck::index_t stride_B = K;
// E = [M0, N0, M1, N1, M2]
/* 0, 3, 1, 4, 2
ck::index_t stride_E_M0 = N0 * M1 * N1 * M2;
ck::index_t stride_E_M1 = N1 * M2;
ck::index_t stride_E_M2 = 1;
ck::index_t stride_E_N0 = M1 * N1 * M2;
ck::index_t stride_E_N1 = M2;
*/
// E = [M2, M0, N0, M1, N1] 2, 0, 3, 1, 4
ck::index_t stride_E_M0 = N0 * M1 * N1;
ck::index_t stride_E_M1 = N1;
ck::index_t stride_E_M2 = M0 * N0 * M1 * N1;
ck::index_t stride_E_N0 = M1 * N1;
ck::index_t stride_E_N1 = 1;
// D = [0, N0, 0, N1, 0]
ck::index_t stride_D_M0 = 0;
ck::index_t stride_D_M1 = 0;
ck::index_t stride_D_M2 = 0;
ck::index_t stride_D_N0 = N1;
ck::index_t stride_D_N1 = 1;
"""
)
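# The stride_E_* values above encode the E = [M2, M0, N0, M1, N1] layout: each
# dimension's stride is the product of the extents of the dimensions to its
# right in that layout (e.g. stride_E_M0 = N0 * M1 * N1, while M2 is outermost
# with stride M0 * N0 * M1 * N1 and N1 is innermost with stride 1).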
EXTRA_SHAPE_TEMPLATE_M2N3 = jinja2.Template(
"""
const int64_t G1 = p_dim0; // G1
const int64_t G2 = p_dim1; // G2
const int64_t G3 = p_dim2; // G3
ck::index_t M0 = M / G1;
ck::index_t M1 = G1;
ck::index_t N0 = G2;
ck::index_t N1 = G3;
ck::index_t N2 = N / G2 / G3;
ck::index_t K0 = K;
ck::index_t G = 1;
// A[G, M0, M1, K0]
std::vector<ck::index_t> a_ms_ks_lengths{G, M0, M1, K0};
std::vector<ck::index_t> a_ms_ks_strides{M0*M1*K0, M1 * K0, K0, 1};
// B[G, N0, N1, N2, K0]
std::vector<ck::index_t> b_ns_ks_lengths{G, N0, N1, N2, K0};
std::vector<ck::index_t> b_ns_ks_strides{N0*N1*N2*K0, N1 * N2 * K0, N2 * K0, K0, 1};
// D[G, N0, M0, N1, M1, N2]
std::vector<ck::index_t> d_ms_ns_lengths{G, M0, M1, N0, N1, N2};
std::vector<ck::index_t> d_ms_ns_strides{N0 * N1 * N2, 0, 0, N1 * N2, N2, 1};
// E[G, N0, M0, N1, M1, N2] 2, 0, 3, 1, 4
std::vector<ck::index_t> e_ms_ns_lengths{G, M0, M1, N0, N1, N2};
std::vector<ck::index_t> e_ms_ns_strides{M0 * M1 * N0 * N1 * N2,
N1 * M1 * N2,
N2,
M0 * N1 * M1 * N2,
M1 * N2,
1};
"""
)
EXTRA_SHAPE_TEMPLATE_M3N2 = jinja2.Template(
"""
const int64_t G1 = p_dim0; // G1
const int64_t G2 = p_dim1; // G2
const int64_t G3 = p_dim2; // G3
ck::index_t M0 = M / G1 / G2;
ck::index_t M1 = G1;
ck::index_t M2 = G2;
ck::index_t N0 = G3;
ck::index_t N1 = N / G3;
ck::index_t K0 = K;
ck::index_t G = 1;
// A[M0, M1, M2, K0]
std::vector<ck::index_t> a_ms_ks_lengths{G, M0, M1, M2, K0};
std::vector<ck::index_t> a_ms_ks_strides{M0 * M1 * M2 * K0, M1 * M2 * K0, M2 * K0, K0, 1};
// B[N0, N1, K0]
std::vector<ck::index_t> b_ns_ks_lengths{G, N0, N1, K0};
std::vector<ck::index_t> b_ns_ks_strides{N0 * N1 * K0, N1 * K0, K0, 1};
// D[M0, N0, M1, N1, M2]
std::vector<ck::index_t> d_ms_ns_lengths{G, M0, M1, M2, N0, N1};
std::vector<ck::index_t> d_ms_ns_strides{N0*N1, 0, 0, 0, N1, 1};
// E[M0, N0, M1, N1, M2]
std::vector<ck::index_t> e_ms_ns_lengths{G, M0, M1, M2, N0, N1};
std::vector<ck::index_t> e_ms_ns_strides{M0 * M1 * M2 * N1 * N0, N0 * M1 * N1, N1, M0 * N0 * M1 * N1, M1 * N1, 1};
"""
)
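# Minimal usage sketch: these snippets are rendered verbatim into the generated
# C++. Only EXTRA_SHAPE_TEMPLATE takes an `indent` variable; the M/N/K, G1..G3,
# p_dim* and *_dim1 symbols are expected to already be in scope in the
# surrounding generated code. For example:
#
#   shape_code = EXTRA_SHAPE_TEMPLATE.render(indent="  ")
#   shape_code_m3n2 = EXTRA_SHAPE_TEMPLATE_M3N2.render()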