#pragma once

#include "src/fastertransformer/ck_extensions/include/ck_extensions/epilogue/epilogue_quant_helper.h"
#include "src/fastertransformer/ck_extensions/include/ck_extensions/ft_gemm_configs.h"
// #include "hip/hip_runtime.h"

using cutlass::epilogue::QuantMode;
namespace fastertransformer {

/*
  This runner supports:
  int8_t inputs (A and B)
  float alpha scalings (either per-col, or per-col x per-row)
  T output (D) where T = {float, half, __nv_bfloat16} // TODO(mseznec)

  Activations, biases, scales and outputs are all assumed to be row-major.
  Weights are assumed to be column-major.
*/

/**
 * Runner for GEMMs with int8_t inputs, float scaling factors and an output of type T.
 *
 * Only the declarations are given here; the member functions are defined elsewhere.
 *
 * @tparam T element type of the output matrix written by gemm().
 */
template<typename T>
class CutlassInt8GemmRunner {
public:
    CutlassInt8GemmRunner();
    ~CutlassInt8GemmRunner();

    /**
     * Runs the int8 GEMM and writes the result through C.
     *
     * @param A               int8_t input matrix.
     * @param B               int8_t input matrix.
     * @param quant_mode      quantization mode forwarded to the implementation.
     * @param alpha_col       float scaling factors.
     *                        NOTE(review): presumably the per-column scales - confirm against the definition.
     * @param alpha_row       float scaling factors.
     *                        NOTE(review): presumably the per-row scales, only read when quant_mode
     *                        asks for per-col x per-row scaling - confirm against the definition.
     * @param C               output matrix with elements of type T.
     * @param m               GEMM problem size.
     * @param n               GEMM problem size.
     * @param k               GEMM problem size.
     *                        NOTE(review): m, n, k presumably follow the usual (m x k) * (k x n) -> (m x n)
     *                        convention - confirm.
     * @param workspace_ptr   scratch buffer supplied by the caller.
     * @param workspace_bytes size of the buffer behind workspace_ptr, in bytes.
     */
    void gemm(const int8_t* A,
              const int8_t* B,
              QuantMode     quant_mode,
              const float*  alpha_col,
              const float*  alpha_row,
              T*            C,
              int           m,
              int           n,
              int           k,
              char*         workspace_ptr,
              const size_t  workspace_bytes
              );
              // hipStream_t  stream);
    // The trailing stream parameter is commented out above, so gemm() takes no stream argument.

    // Returns desired workspace size in bytes.
    int getWorkspaceSize(const int m, const int n, const int k);

private:
    // NOTE(review): presumably the largest split-k factor the implementation will consider - confirm.
    static constexpr int split_k_limit = 7;

    // Default-initialised so the member never holds an indeterminate value, even if a
    // constructor definition does not assign it.
    // NOTE(review): presumably the device's multiprocessor count, filled in by the constructor - confirm.
    int multi_processor_count_ = 0;
};

}  // namespace fastertransformer
