Commit 9fed0ade authored by Jing Zhang's avatar Jing Zhang
Browse files

weight permute with splitki

parent 35d8627b
......@@ -39,6 +39,9 @@ using DeviceGemmV2Instance =
2, 32, 32, 1,
1, 1, S<1, 16, 1, 4>, 4,
ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>;
static int NPerBlock = 16;
static int KPerBlock = 256;
#else
128,
16, 32,
......@@ -51,8 +54,11 @@ using DeviceGemmV2Instance =
2, 32, 32, 0,
1, 1, S<1, 16, 1, 8>, 4,
ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>;
static int NPerBlock = 32;
static int KPerBlock = 128;
#endif
// clang-format on
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
......@@ -146,30 +152,37 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
#if 1
int NPerBlock = 32;
int KPerBlock = 128;
//weight permute
#if 0
int N1 = NPerBlock;
int K1 = KPerBlock;
int N0 = N / N1;
int K0 = K / K1;
int K01 = K0 / KBatch;
int K00 = KBatch;
std::cout << "K00 = " << K00 << " K01 = " << K01 << std::endl;
for(int k = 0; k < K00; k++)
{
for(int i = 0; i < N0; i++)
{
for(int j = 0; j < K0; j++)
for(int j = 0; j < K01; j++)
{
for(int ii = 0; ii < N1; ii++)
{
for(int jj = 0; jj < K1; jj++)
{
b_k_n_permute(i * K0 * N1 * K1 + j * N1 * K1 + ii * K1 + jj) =
b_k_n((i * N1 + ii) * K + (j * K1 + jj));
b_k_n_permute(k * N0 * K01 * N1 * K1 + i * K01 * N1 * K1 + j * N1 * K1 + ii * K1 + jj) =
b_k_n((i * N1 + ii) * K + (k * K01 * K1 + j * K1 + jj));
}
}
}
}
}
#else
for(int i = 0; i < N; i++)
{
......
......@@ -14,6 +14,8 @@
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
//#define WEIGHT_PERMUTE
namespace ck {
// Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same
......@@ -387,8 +389,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
}
else
{
// B Tile Permute
#if 0
// Weight Tile Permute
#ifndef WEIGHT_PERMUTE
// not pad N or K
const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
b_grid_desc_nraw_kraw,
......@@ -619,10 +621,10 @@ struct GridwiseGemm_xdl_cshuffle_v3
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
{
#if 0
#ifndef WEIGHT_PERMUTE
b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize;
#else
const int k0_offset = karg.KRead * NPerBlock;
const int k0_offset = karg.KRead * karg.N;
b_k_split_offset = blockIdx.z * k0_offset / BPackedSize;
#endif
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment