Commit 516bbdcb authored by Astha Rai

initial push with templated dev op

parent fde6d274
#include "model_interface.h"
#include <iostream>
#include <unordered_map>
#include "model-generated.h"
#include "model_container.h"
// Important: don't let exceptions escape the functions below.
// They can cause problems when -fvisibility=hidden. But more
// importantly, they can crash the program if they try to cross
// the language boundary into Python.
#define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \
try { \
__VA_ARGS__ \
} catch (const std::exception& e) { \
LOG(ERROR) << "Error: " << e.what(); \
return AITemplateError::AITemplateFailure; \
} catch (...) { \
LOG(ERROR) << "Unknown exception occurred."; \
return AITemplateError::AITemplateFailure; \
} \
return AITemplateError::AITemplateSuccess;
#define RETURN_ERROR_IF_NULL(var) \
if (var == nullptr) { \
LOG(ERROR) << "Variable " << #var << " can't be null"; \
return AITemplateError::AITemplateFailure; \
}
namespace ait {
namespace {
class DefaultAllocator : public AITemplateAllocator {
public:
void* Allocate(size_t n_bytes) override {
void* result;
DEVICE_CHECK(DeviceMalloc(&result, n_bytes));
return result;
}
void Free(void* ptr) override {
DEVICE_CHECK(FreeDeviceMemory(ptr));
}
};
class TrackingAllocator : public DefaultAllocator {
public:
void* Allocate(size_t n_bytes) override {
auto* result = DefaultAllocator::Allocate(n_bytes);
num_bytes_ += n_bytes;
return result;
}
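// Returns the cumulative number of bytes handed out by Allocate(); Free()
// does not decrement this count.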
size_t NumBytesAllocated() const {
return num_bytes_;
}
private:
size_t num_bytes_ = 0;
};
DefaultAllocator default_allocator;
} // namespace
} // namespace ait
extern "C" {
AITemplateError AITemplateModelContainerCreate(
AITemplateModelHandle* ret,
size_t num_runtimes,
AITemplateAllocator* allocator) {
if (num_runtimes == 0) {
LOG(ERROR) << "num_runtimes must be positive, but got 0";
return AITemplateError::AITemplateFailure;
}
RETURN_ERROR_IF_NULL(ret)
AITemplateAllocator& allocator_ref =
allocator == nullptr ? ait::default_allocator : *allocator;
CONVERT_EXCEPTION_TO_ERROR_CODE({
auto* m = ait::CreateModelContainer(num_runtimes, allocator_ref);
*ret = reinterpret_cast<AITemplateModelHandle>(m);
})
}
AITemplateError AITemplateModelContainerDelete(AITemplateModelHandle handle) {
RETURN_ERROR_IF_NULL(handle)
CONVERT_EXCEPTION_TO_ERROR_CODE({
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
delete m;
});
}
AITemplateError AITemplateModelContainerSetConstant(
AITemplateModelHandle handle,
const char* name,
const AITData* tensor) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(tensor)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({ m->SetConstant(name, *tensor); })
}
AITemplateError AITemplateModelContainerRun(
AITemplateModelHandle handle,
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
AITemplateStreamHandle stream_handle,
bool sync,
bool graph_mode,
int64_t** output_shapes_out) {
RETURN_ERROR_IF_NULL(handle)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({
m->Run(
inputs,
num_inputs,
outputs,
num_outputs,
stream,
sync,
graph_mode,
output_shapes_out);
})
}
AITemplateError AITemplateModelContainerRunWithOutputsOnHost(
AITemplateModelHandle handle,
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
AITemplateStreamHandle stream_handle,
bool graph_mode,
int64_t** output_shapes_out) {
RETURN_ERROR_IF_NULL(handle)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({
m->RunWithOutputsOnHost(
inputs,
num_inputs,
outputs,
num_outputs,
stream,
graph_mode,
output_shapes_out);
})
}
AITemplateError AITemplateModelContainerBenchmark(
AITemplateModelHandle handle,
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
AITemplateStreamHandle stream_handle,
bool graph_mode,
size_t count,
size_t num_threads,
bool use_unique_stream_per_thread,
float* runtime_ms,
int64_t** output_shapes_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(runtime_ms)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
auto stream = reinterpret_cast<ait::StreamType>(stream_handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({
*runtime_ms = m->Benchmark(
inputs,
num_inputs,
outputs,
num_outputs,
stream,
graph_mode,
count,
num_threads,
use_unique_stream_per_thread,
output_shapes_out);
})
}
AITemplateError AITemplateModelContainerGetNumInputs(
AITemplateModelHandle handle,
size_t* num_inputs_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(num_inputs_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({ *num_inputs_out = m->NumInputs(); })
}
AITemplateError AITemplateModelContainerGetInputName(
AITemplateModelHandle handle,
size_t input_idx,
const char** input_name_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(input_name_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE(
{ *input_name_out = m->InputName(input_idx); })
}
AITemplateError AITemplateModelContainerGetNumOutputs(
AITemplateModelHandle handle,
size_t* num_outputs_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(num_outputs_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({ *num_outputs_out = m->NumOutputs(); })
}
AITemplateError AITemplateModelContainerGetOutputName(
AITemplateModelHandle handle,
size_t output_idx,
const char** output_name_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(output_name_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE(
{ *output_name_out = m->OutputName(output_idx); })
}
AITemplateError AITemplateModelContainerGetMaximumOutputShape(
AITemplateModelHandle handle,
size_t output_idx,
AITemplateParamShape* shape_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(shape_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE(
{ *shape_out = m->MaxOutputShape(output_idx); })
}
AITemplateError AITemplateModelContainerGetOutputDtype(
AITemplateModelHandle handle,
size_t output_idx,
AITemplateDtype* dtype_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(dtype_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({ *dtype_out = m->OutputDtype(output_idx); })
}
AITemplateError AITemplateModelContainerGetNumRuntimes(
AITemplateModelHandle handle,
size_t* num_runtimes_out) {
RETURN_ERROR_IF_NULL(handle)
RETURN_ERROR_IF_NULL(num_runtimes_out)
auto* m = reinterpret_cast<ait::ModelContainer*>(handle);
CONVERT_EXCEPTION_TO_ERROR_CODE({ *num_runtimes_out = m->GetNumRuntimes(); })
}
AITemplateError AITemplateAllocatorCreate(
AITemplateAllocator** allocator_out,
AITemplateAllocatorType allocator_type) {
RETURN_ERROR_IF_NULL(allocator_out);
CONVERT_EXCEPTION_TO_ERROR_CODE({
switch (allocator_type) {
case AITemplateAllocatorType::kDefault:
*allocator_out = new ait::DefaultAllocator();
break;
case AITemplateAllocatorType::kTracking:
*allocator_out = new ait::TrackingAllocator();
break;
default:
throw std::runtime_error("Unrecognized allocator type");
}
});
}
AITemplateError AITemplateAllocatorDelete(AITemplateAllocator* allocator) {
RETURN_ERROR_IF_NULL(allocator);
delete allocator;
return AITemplateError::AITemplateSuccess;
}
AITemplateError AITemplateTrackingAllocatorGetNumBytes(
AITemplateAllocator* allocator,
size_t* num_bytes_out) {
RETURN_ERROR_IF_NULL(allocator);
RETURN_ERROR_IF_NULL(num_bytes_out);
CONVERT_EXCEPTION_TO_ERROR_CODE({
auto* tracking_allocator = dynamic_cast<ait::TrackingAllocator*>(allocator);
if (tracking_allocator == nullptr) {
throw std::runtime_error("Allocator was not a tracking allocator!");
}
*num_bytes_out = tracking_allocator->NumBytesAllocated();
});
}
} // extern "C"
#pragma once
#include <stddef.h>
#include <stdint.h>
#include <numeric>
#include <stdexcept>
#include <utility>
#include <vector>
// We compile all models with -fvisibility=hidden. Any symbols that need to be
// exposed in the final shared library must be declared with AIT_EXPORT to make
// them visible.
#ifdef __GNUC__ // Applies to any compiler with GNU extensions (clang and g++)
#define AIT_EXPORT __attribute__((__visibility__("default")))
#else
#ifdef _WIN32
#define AIT_EXPORT __declspec(dllexport)
#else
#define AIT_EXPORT
#endif
#endif
struct AITemplateModelOpaque {};
using AITemplateModelHandle = AITemplateModelOpaque*;
enum class AITemplateError : int {
AITemplateSuccess = 0,
AITemplateFailure = 1,
};
struct AITemplateParamShape {
AITemplateParamShape() : shape_data(nullptr), size(0) {}
AITemplateParamShape(const int64_t* shape_data_in, size_t size_in)
: shape_data(shape_data_in), size(size_in) {}
const int64_t* shape_data;
size_t size;
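// Total number of elements implied by the shape, e.g. {2, 3, 4} -> 24.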
size_t Numel() const {
return std::accumulate(
shape_data, shape_data + size, 1, std::multiplies<int64_t>());
}
};
enum class AITemplateDtype {
kUnset = 0,
kHalf,
kFloat,
kInt,
kLong,
kBool,
};
struct AITData {
AITData() : ptr(nullptr), dtype(AITemplateDtype::kUnset) {}
AITData(
void* ptr_in,
const AITemplateParamShape& shape_in,
AITemplateDtype dtype_in)
: ptr(ptr_in), shape(shape_in), dtype(dtype_in) {}
void* ptr;
AITemplateParamShape shape;
AITemplateDtype dtype;
};
inline size_t AITemplateDtypeSizeBytes(AITemplateDtype dtype) {
switch (dtype) {
case AITemplateDtype::kHalf:
return 2;
case AITemplateDtype::kFloat:
return 4;
case AITemplateDtype::kInt:
return 4;
case AITemplateDtype::kLong:
return 8;
case AITemplateDtype::kBool:
return 1;
case AITemplateDtype::kUnset:
throw std::runtime_error("Unset dtype has no size!");
}
// Unreachable for valid enum values; guards against out-of-range inputs and
// silences missing-return warnings.
throw std::runtime_error("Unrecognized dtype");
}
struct AITemplateStreamOpaque {};
using AITemplateStreamHandle = AITemplateStreamOpaque*;
// Allocator to use for GPU mallocs and frees. Allocations will only happen
// when the ModelContainer is created.
class AITemplateAllocator {
public:
virtual void* Allocate(size_t nbytes) = 0;
virtual void Free(void* ptr) = 0;
virtual ~AITemplateAllocator() = default;
};
// Some custom allocators are provided. They can be created by passing
// an enum into the AITemplateAllocatorCreate() function.
enum class AITemplateAllocatorType {
// The default allocator just uses the backend's default malloc/free.
kDefault = 0,
// The tracking allocator is like the default allocator, but it keeps
// track of how many bytes it has allocated. Mainly used for testing.
kTracking,
};
extern "C" {
// Create a ModelContainer. See model_container.h for all the details.
// Some important high-level notes:
// * If allocator is null, a default allocator is used (forwards to
// {cuda/hip}{Malloc/Free}).
// * We assume that the allocator lives at least as long as the ModelContainer.
AIT_EXPORT AITemplateError AITemplateModelContainerCreate(
AITemplateModelHandle* ret,
size_t num_runtimes,
AITemplateAllocator* allocator = nullptr);
AIT_EXPORT AITemplateError
AITemplateModelContainerDelete(AITemplateModelHandle handle);
AIT_EXPORT AITemplateError AITemplateModelContainerSetConstant(
AITemplateModelHandle handle,
const char* name,
const AITData* tensor);
AIT_EXPORT AITemplateError AITemplateModelContainerRun(
AITemplateModelHandle handle,
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
AITemplateStreamHandle stream_handle,
bool sync,
bool graph_mode,
int64_t** output_shapes_out);
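//
// Illustrative usage sketch (a minimal, hypothetical example, not part of the
// API surface): inputs, outputs, and output_shapes_out are caller-prepared
// arrays of AITData / shape pointers, and error codes are left unchecked for
// brevity.
//
//   AITemplateModelHandle handle;
//   AITemplateModelContainerCreate(&handle, /*num_runtimes=*/1);
//   AITemplateModelContainerRun(
//       handle, inputs, num_inputs, outputs, num_outputs,
//       /*stream_handle=*/nullptr, /*sync=*/true, /*graph_mode=*/false,
//       output_shapes_out);
//   AITemplateModelContainerDelete(handle);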
// Like AITemplateModelContainerRun, but expects outputs to be allocated on the
// host. Does an extra sync/copy at the end to copy them over. Warning: don't
// use this! It's not optimal with respect to performance. It's here for use by
// internal constant folding passes.
AIT_EXPORT AITemplateError AITemplateModelContainerRunWithOutputsOnHost(
AITemplateModelHandle handle,
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
AITemplateStreamHandle stream_handle,
bool graph_mode,
int64_t** output_shapes_out);
AIT_EXPORT AITemplateError AITemplateModelContainerBenchmark(
AITemplateModelHandle handle,
const AITData* inputs,
size_t num_inputs,
AITData* outputs,
size_t num_outputs,
AITemplateStreamHandle stream_handle,
bool graph_mode,
size_t count,
size_t num_threads,
bool use_unique_stream_per_thread,
float* runtime_ms,
int64_t** output_shapes_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetNumInputs(
AITemplateModelHandle handle,
size_t* num_inputs_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetInputName(
AITemplateModelHandle handle,
size_t input_idx,
const char** input_name_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetNumOutputs(
AITemplateModelHandle handle,
size_t* num_outputs_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetOutputName(
AITemplateModelHandle handle,
size_t output_idx,
const char** output_name_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetMaximumOutputShape(
AITemplateModelHandle handle,
size_t output_idx,
AITemplateParamShape* shape_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetOutputDtype(
AITemplateModelHandle handle,
size_t output_idx,
AITemplateDtype* dtype_out);
AIT_EXPORT AITemplateError AITemplateModelContainerGetNumRuntimes(
AITemplateModelHandle handle,
size_t* num_runtimes_out);
AIT_EXPORT AITemplateError AITemplateAllocatorCreate(
AITemplateAllocator** allocator_out,
AITemplateAllocatorType allocator_type);
AIT_EXPORT AITemplateError
AITemplateAllocatorDelete(AITemplateAllocator* allocator);
// Get the number of bytes allocated; mainly used for testing.
AIT_EXPORT AITemplateError AITemplateTrackingAllocatorGetNumBytes(
AITemplateAllocator* allocator,
size_t* num_bytes_out);
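//
// Illustrative sketch of the allocator API (hypothetical variable names,
// error codes unchecked): a tracking allocator can be used to measure how
// much device memory a container allocates at creation time.
//
//   AITemplateAllocator* tracking = nullptr;
//   AITemplateAllocatorCreate(&tracking, AITemplateAllocatorType::kTracking);
//   AITemplateModelHandle handle;
//   AITemplateModelContainerCreate(&handle, /*num_runtimes=*/1, tracking);
//   size_t num_bytes = 0;
//   AITemplateTrackingAllocatorGetNumBytes(tracking, &num_bytes);
//   AITemplateModelContainerDelete(handle);
//   AITemplateAllocatorDelete(tracking);
//
// The allocator must outlive the container (see the note on
// AITemplateModelContainerCreate above).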
} // extern "C"
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Normalization common codegen for ROCM.
"""
import os
import re
from hashlib import sha1
from typing import Any, Dict, List, OrderedDict, Tuple
import jinja2
from ...target import Target
FUNC_CALL_PARAM_TEMPLATE = jinja2.Template("(void *)({{name}})")
INSTANCE_TEMPLATE = jinja2.Template(
"""
{{config}}
using {{name}} = {{ config_name }};
"""
)
ARGS_PARSE_TEMPLATE = jinja2.Template(
"""
{% for idx in range(rank) %}
const int64_t in_{{idx}} = std::stoi(argv[{{ idx + 1 }}]);
{% endfor %}
"""
)
STRUCTS_DEF_TEMPLATE = jinja2.Template(
"""
struct ProfilerMemoryPool {
ProfilerMemoryPool() {
std::random_device rd;
gen = std::mt19937(rd());
uniform_dist = std::uniform_int_distribution<int64_t>(1, 48964896);
offsets.reserve(512);
strides.reserve(512);
copies.reserve(512);
ptrs.reserve(512);
// Create the rocRAND generator up front so the seed/generate calls in
// AllocateGaussianTensor operate on a valid handle.
rocrand_create_generator(&generator, ROCRAND_RNG_PSEUDO_DEFAULT);
}
~ProfilerMemoryPool() {
for(int i = 0; i < ptrs.size(); i++){
hipFree(ptrs[i]);
}
rocrand_destroy_generator(generator);
}
template <typename DType>
DType* AllocateGaussianTensor(int64_t size) {
size_t length = size * sizeof(DType);
DType *d_x;
hipMalloc(&d_x, length);
float mean = 0.0f;
float stddev = 1.0f;
uint64_t seed = uniform_dist(gen);
rocrand_set_seed(generator, seed);
rocrand_generate_normal(generator, reinterpret_cast<float*>(d_x), size, mean, stddev);
return d_x;
}
ck::half_t* AllocateHalfGaussianTensor(int64_t size) {
return reinterpret_cast<ck::half_t*>(
AllocateGaussianTensor<ck::half_t>(size));
}
int AllocateHalfTensor(int64_t size, int64_t copy) {
offsets.push_back(0);
strides.push_back(size);
copies.push_back(copy);
auto ptr = AllocateHalfGaussianTensor(size * copy);
ptrs.push_back(reinterpret_cast<void*>(ptr));
return ptrs.size() - 1;
}
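// Returns the next copy of tensor idx in round-robin order: the per-tensor
// offset advances by the tensor's stride on every request and wraps back to
// zero once all copies have been handed out, so repeated profiler calls cycle
// through distinct buffers.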
ck::half_t* RequestHalfTensorByIdx(int idx) {
auto copy = copies.at(idx);
auto offset = offsets.at(idx);
auto stride = strides.at(idx);
ck::half_t* ptr = reinterpret_cast<ck::half_t*>(ptrs.at(idx));
ptr += offset;
offset += stride;
if (offset == copy * stride) {
offset = 0;
}
offsets[idx] = offset;
return ptr;
}
std::vector<int64_t> offsets;
std::vector<int64_t> strides;
std::vector<int64_t> copies;
std::vector<void*> ptrs;
std::mt19937 gen;
std::uniform_int_distribution<int64_t> uniform_dist;
rocrand_generator generator;
};
// hack for DeviceMem linking error
// TODO fix this by making CK a header-only lib
// <<< hack begin
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p) const
{
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p) const
{
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
struct KernelTimerImpl
{
KernelTimerImpl() {
hipGetErrorString(hipEventCreate(&mStart));
hipGetErrorString(hipEventCreate(&mEnd));
}
~KernelTimerImpl() {
hipGetErrorString(hipEventDestroy(mStart));
hipGetErrorString(hipEventDestroy(mEnd));
}
void Start() {
hipGetErrorString(hipDeviceSynchronize());
hipGetErrorString(hipEventRecord(mStart, nullptr));
}
void End() {
hipGetErrorString(hipEventRecord(mEnd, nullptr));
hipGetErrorString(hipEventSynchronize(mEnd));
}
float GetElapsedTime() const {
float time;
hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
return time;
}
hipEvent_t mStart, mEnd;
};
// >>> hack end
"""
)
PROFILER_TEMPLATE = jinja2.Template(
"""
size_t GLOBAL_WORKSPACE_SIZE = 0;
{{op_func}}
{{structs_def}}
int main(int argc, char** argv) {
{{args_parse}}
auto memory_pool = std::make_unique<ProfilerMemoryPool>();
hipStream_t stream = nullptr;
{{tensor_decl}}
// warmup
for(int i = 0; i < 3; ++i) {
{{func_call}}
}
// run
KernelTimerImpl timer;
timer.Start();
for(int i = 0; i < 5; ++i) {
{{func_call}}
}
timer.End();
std::cout << "WS:" <<GLOBAL_WORKSPACE_SIZE<<std::endl;
std::cout << "TIME:" << timer.GetElapsedTime() << std::endl;
}
"""
)
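# The rendered profiler binary takes the input dims as argv[1..rank] (see
# ARGS_PARSE_TEMPLATE above) and reports its results on stdout as "WS:<bytes>"
# and "TIME:<ms>" lines.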
FUNC_TEMPLATE = jinja2.Template(
"""
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <random>
#include <rocrand/rocrand.h>
#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/reduction_operator.hpp"
{{extra_headers}}
{{extra_code}}
{{instances_decl}}
{{func_signature}}
{
{{shape_eval}}
{{exec_paths}}
}
"""
)
FUNC_CALL_TEMPLATE = jinja2.Template(
"""
{{indent}}{{func_name}}(
{{indent}} {{input}},
{{indent}} {{output}},
{% for name in input_dim_names %}
{{indent}} const_cast<int64_t *>(&{{name}}),
{% endfor %}
{{indent}} stream
{{indent}});
"""
)
def extract_config(func_attrs):
"""Extract (operation name, operation instance) pair
from all operation candidates.
Parameters
----------
op_kind : ck_lib.library.OperationKind
Operation kind.
extra_kind : ck_lib.library.[AnyKind]
Used to as extra flag to distinguish kernels.
E.g. bias_add_relu vs. add_relu_bias
f_prop_op: function
Used to filter operation.
Returns
-------
Dict
Extracted (operation name, operation instance) pair.
"""
import ck_lib
op_kind = ck_lib.library.OperationKind.Softmax
extra_kind = len(func_attrs["inputs"][0]._attrs["shape"])
extract_ops = list(Target.current()._operators[op_kind][extra_kind].items())
softmax_ops = OrderedDict()
for key, value in extract_ops:
softmax_ops[key] = value[0]
func_attrs["op_instance"] = softmax_ops
def emit_instance(op):
"""Emit instance"""
import ck_lib # noqa: F401
op_def = op.emit()
return op_def
def extract_config_name(config):
"""Extract configuration names.
Parameters
----------
config : str
Configuration as a string in the format of 'using model = xxx'.
Returns
-------
str
Extracted name from the statement, e.g. 'model' for 'using model = xxx'.
Raises
------
RuntimeError
Invalid config.
"""
pattern = re.compile(r"\s*using\s(.*?)\s=")
decl = config.split("\n")[1]
match = pattern.match(decl)
if match is None:
raise RuntimeError("Invalid config: \n" + config)
return match.groups()[0]
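# Illustrative example (hypothetical instance name): for a config whose second
# line is a 'using' declaration, e.g.
#   config = "\nusing DeviceInstance_fp16 = ck::DeviceSoftmax<...>;"
# extract_config_name(config) returns "DeviceInstance_fp16".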
def gen_profiler(
func_attrs: Dict[str, Any],
workdir: str,
rank: int,
shape_eval_template: jinja2.Template,
exec_template: jinja2.Template,
tensor_decl_template: jinja2.Template,
extra_header_template: jinja2.Template,
get_func_signature: Any,
extra_code: str = "",
func_call_template: jinja2.Template = FUNC_CALL_TEMPLATE,
indent: str = " ",
) -> List[Tuple[str, str]]:
"""Generates standalone executables for the profiler.
Parameters
----------
func_attrs : Dict
Operation attributes.
workdir : str
Directory to store the generated outputs.
rank: int
Rank of the input tensor. If using [M, N] in exec_key, the rank here
must be 2 because it implies that the inputs are reshaped for profiling.
For code gen, the real shapes are used.
exec_template : jinja2.Template
Execution block template.
tensor_decl_template: jinja2.Template
Tensor declaration template.
extra_header_template : jinja2.Template
Extra header template.
indent : str, optional
Indent for codegen, target dependent e.g. C++, python, etc., by default " ".
"""
op_type = func_attrs["op"]
shape_eval = shape_eval_template.render(rank=rank) if shape_eval_template else ""
eps = func_attrs.get("eps", "1e-5")
op_instance = func_attrs["op_instance"]
file_pairs = []
for op_name, op in op_instance.items():
config = emit_instance(op)
config_name = extract_config_name(config)
instances = INSTANCE_TEMPLATE.render(
name="DeviceInstance", config_name=config_name, config=config
)
exe_path = exec_template.render(
instance="DeviceInstance",
dtype="void",
reduce_dims=rank - 1,
rank=rank,
eps=eps,
)
op_func = FUNC_TEMPLATE.render(
instances_decl=instances,
func_signature=get_func_signature(func_attrs),
shape_eval=shape_eval,
exec_paths=exe_path,
extra_headers=extra_header_template.render(),
extra_code=extra_code,
)
structs_def = STRUCTS_DEF_TEMPLATE.render()
args_parse = ARGS_PARSE_TEMPLATE.render(rank=rank)
tensor_decl = tensor_decl_template.render(rank=rank)
input_dim_names = [f"in_{i}" for i in range(rank)]
func_call = func_call_template.render(
func_name=func_attrs["name"],
input="(void *) memory_pool->RequestHalfTensorByIdx(0)",
gamma="(void *) memory_pool->RequestHalfTensorByIdx(2)",
beta="(void *) memory_pool->RequestHalfTensorByIdx(3)",
output="(void *) memory_pool->RequestHalfTensorByIdx(1)",
input_dim_names=input_dim_names,
indent=indent,
)
code = PROFILER_TEMPLATE.render(
op_func=op_func,
structs_def=structs_def,
args_parse=args_parse,
tensor_decl=tensor_decl,
func_call=func_call,
)
prefix = os.path.join(workdir, "profiler", op_type)
if not os.path.exists(prefix):
os.makedirs(prefix)
src_path = os.path.join(prefix, op_name + ".cpp")
obj_path = os.path.join(prefix, op_name)
if os.path.exists(obj_path):
continue
with open(src_path, "w") as fo:
fo.write(code)
file_pairs.append((src_path, obj_path))
return file_pairs
# no longer used by layernorm
def gen_function(
func_attrs: Dict[str, Any],
exec_template: jinja2.Template,
extra_header_template: jinja2.Template,
get_func_signature: Any,
) -> str:
"""Generate function body.
Parameters
----------
func_attrs : Dict
Operation attributes.
exec_template : jinja2.Template
Execution block template.
extra_header_template : jinja2.Template
Extra header template.
Returns
-------
str
The rendered template of generated function body.
"""
shapes = func_attrs["inputs"][0]._attrs["shape"]
rank = len(shapes)
exec_path = func_attrs["exec_path"]
op_instance = func_attrs["op_instance"]
inst_def_flag = set()
instances = {}
instance_decl = ""
for exec_item in exec_path.values():
fname = "f" + sha1(exec_item.exec_cond.encode()).hexdigest()
algo = exec_item.algo
if algo not in inst_def_flag:
config = emit_instance(op_instance[algo])
inst_def_flag.add(algo)
else:
config = ""
inst = INSTANCE_TEMPLATE.render(
config=config, name=fname, config_name=extract_config_name(config)
)
instances[exec_item.exec_cond] = inst
instance_decl += inst
exec_cond_template = func_attrs["exec_cond_template"]
exec_paths = ""
for key, _ in instances.items():
fname = "f" + sha1(key.encode()).hexdigest()
program = exec_template.render(
instance=fname, dtype="void", reduce_dims=rank - 1, rank=rank
)
cond_vars = re.findall(r"\S+(?= >=)", key)
cond_vars += re.findall(r"\S+(?= ==)", key)
cond = key
for i, var in enumerate(cond_vars):
cond = cond.replace(var + " ", "*in_" + str(i))
exec_inst = exec_cond_template.render(indent=" ", cond=cond, program=program)
exec_paths += exec_inst
return FUNC_TEMPLATE.render(
instances_decl=instance_decl,
func_signature=get_func_signature(func_attrs),
exec_paths=exec_paths,
extra_headers=extra_header_template.render(),
)
def gen_function_call(func_attrs, indent=" "):
"""Generates function call.
Parameters
----------
func_attrs : Dict
Stores the operation attributes.
indent : str, optional
Indent for codegen, target dependent e.g. C++, python, etc., by default " ".
Returns
-------
str
The rendered template of generated function call.
"""
input_name = FUNC_CALL_PARAM_TEMPLATE.render(
name=func_attrs["inputs"][0]._attrs["name"]
)
output_name = FUNC_CALL_PARAM_TEMPLATE.render(
name=func_attrs["outputs"][0]._attrs["name"]
)
shapes = func_attrs["inputs"][0]._attrs["shape"]
input_dim_names = [shape._attrs["name"] for shape in shapes]
return FUNC_CALL_TEMPLATE.render(
func_name=func_attrs["name"],
input=input_name,
output=output_name,
input_dim_names=input_dim_names,
indent=indent,
)
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import jinja2
EXTRA_SHAPE_TEMPLATE = jinja2.Template(
"""
{{indent}}const int64_t stride_a = *a_dim1;
{{indent}}const int64_t stride_b = *b_dim1;
{{indent}}const int64_t stride_c = *c_dim1;
ck::index_t M0 = M / G1 / G2;
ck::index_t M1 = G1;
ck::index_t M2 = G2;
ck::index_t N0 = G3;
ck::index_t N1 = N / G3;
// GEMM shape
//ck::index_t M = M0 * M1 * M2;
//ck::index_t N = N0 * N1;
//ck::index_t K = 128;
//ck::index_t stride_A = K;
//ck::index_t stride_B = K;
// E = [M0, N0, M1, N1, M2]
/* 0, 3, 1, 4, 2
ck::index_t stride_E_M0 = N0 * M1 * N1 * M2;
ck::index_t stride_E_M1 = N1 * M2;
ck::index_t stride_E_M2 = 1;
ck::index_t stride_E_N0 = M1 * N1 * M2;
ck::index_t stride_E_N1 = M2;
*/
// E = [M2, M0, N0, M1, N1] 2, 0, 3, 1, 4
ck::index_t stride_E_M0 = N0 * M1 * N1;
ck::index_t stride_E_M1 = N1;
ck::index_t stride_E_M2 = M0 * N0 * M1 * N1;
ck::index_t stride_E_N0 = M1 * N1;
ck::index_t stride_E_N1 = 1;
// D = [0, N0, 0, N1, 0]
ck::index_t stride_D_M0 = 0;
ck::index_t stride_D_M1 = 0;
ck::index_t stride_D_M2 = 0;
ck::index_t stride_D_N0 = N1;
ck::index_t stride_D_N1 = 1;
"""
)
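# The stride_E_* values above encode the E = [M2, M0, N0, M1, N1] layout: each
# dimension's stride is the product of the extents of the dimensions to its
# right in that layout (e.g. stride_E_M0 = N0 * M1 * N1, while M2 is outermost
# with stride M0 * N0 * M1 * N1 and N1 is innermost with stride 1).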
EXTRA_SHAPE_TEMPLATE_M2N3 = jinja2.Template(
"""
const int64_t G1 = p_dim0; // G1
const int64_t G2 = p_dim1; // G2
const int64_t G3 = p_dim2; // G3
ck::index_t M0 = M / G1;
ck::index_t M1 = G1;
ck::index_t N0 = G2;
ck::index_t N1 = G3;
ck::index_t N2 = N / G2 / G3;
ck::index_t K0 = K;
ck::index_t G = 1;
// A[G, M0, M1, K0]
std::vector<ck::index_t> a_ms_ks_lengths{G, M0, M1, K0};
std::vector<ck::index_t> a_ms_ks_strides{M0*M1*K0, M1 * K0, K0, 1};
// B[G, N0, N1, N2, K0]
std::vector<ck::index_t> b_ns_ks_lengths{G, N0, N1, N2, K0};
std::vector<ck::index_t> b_ns_ks_strides{N0*N1*N2*K0, N1 * N2 * K0, N2 * K0, K0, 1};
// D[G, N0, M0, N1, M1, N2]
std::vector<ck::index_t> d_ms_ns_lengths{G, M0, M1, N0, N1, N2};
std::vector<ck::index_t> d_ms_ns_strides{N0 * N1 * N2, 0, 0, N1 * N2, N2, 1};
// E[G, N0, M0, N1, M1, N2] 2, 0, 3, 1, 4
std::vector<ck::index_t> e_ms_ns_lengths{G, M0, M1, N0, N1, N2};
std::vector<ck::index_t> e_ms_ns_strides{M0 * M1 * N0 * N1 * N2,
N1 * M1 * N2,
N2,
M0 * N1 * M1 * N2,
M1 * N2,
1};
"""
)
EXTRA_SHAPE_TEMPLATE_M3N2 = jinja2.Template(
"""
const int64_t G1 = p_dim0; // G1
const int64_t G2 = p_dim1; // G2
const int64_t G3 = p_dim2; // G3
ck::index_t M0 = M / G1 / G2;
ck::index_t M1 = G1;
ck::index_t M2 = G2;
ck::index_t N0 = G3;
ck::index_t N1 = N / G3;
ck::index_t K0 = K;
ck::index_t G = 1;
// A[M0, M1, M2, K0]
std::vector<ck::index_t> a_ms_ks_lengths{G, M0, M1, M2, K0};
std::vector<ck::index_t> a_ms_ks_strides{M0 * M1 * M2 * K0, M1 * M2 * K0, M2 * K0, K0, 1};
// B[N0, N1, K0]
std::vector<ck::index_t> b_ns_ks_lengths{G, N0, N1, K0};
std::vector<ck::index_t> b_ns_ks_strides{N0 * N1 * K0, N1 * K0, K0, 1};
// D[M0, N0, M1, N1, M2]
std::vector<ck::index_t> d_ms_ns_lengths{G, M0, M1, M2, N0, N1};
std::vector<ck::index_t> d_ms_ns_strides{N0*N1, 0, 0, 0, N1, 1};
// E[M0, N0, M1, N1, M2]
std::vector<ck::index_t> e_ms_ns_lengths{G, M0, M1, M2, N0, N1};
std::vector<ck::index_t> e_ms_ns_strides{M0 * M1 * M2 * N1 * N0, N0 * M1 * N1, N1, M0 * N0 * M1 * N1, M1 * N1, 1};
"""
)
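# Minimal usage sketch: these snippets are rendered verbatim into the generated
# C++. Only EXTRA_SHAPE_TEMPLATE takes an `indent` variable; the M/N/K, G1..G3,
# p_dim* and *_dim1 symbols are expected to already be in scope in the
# surrounding generated code. For example:
#
#   shape_code = EXTRA_SHAPE_TEMPLATE.render(indent="  ")
#   shape_code_m3n2 = EXTRA_SHAPE_TEMPLATE_M3N2.render()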