"tests/pipelines/vscode:/vscode.git/clone" did not exist on "bbe8d3ae1330ecc1b90a9873b60d290a83254989"
Commit 9fed0ade authored by Jing Zhang
Browse files

weight permute with splitki

parent 35d8627b
...@@ -39,6 +39,9 @@ using DeviceGemmV2Instance = ...@@ -39,6 +39,9 @@ using DeviceGemmV2Instance =
2, 32, 32, 1, 2, 32, 32, 1,
1, 1, S<1, 16, 1, 4>, 4, 1, 1, S<1, 16, 1, 4>, 4,
ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>; ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>;
static int NPerBlock = 16;
static int KPerBlock = 256;
#else #else
128, 128,
16, 32, 16, 32,
...@@ -51,8 +54,11 @@ using DeviceGemmV2Instance = ...@@ -51,8 +54,11 @@ using DeviceGemmV2Instance =
2, 32, 32, 0, 2, 32, 32, 0,
1, 1, S<1, 16, 1, 8>, 4, 1, 1, S<1, 16, 1, 8>, 4,
ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>; ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>;
static int NPerBlock = 32;
static int KPerBlock = 128;
#endif #endif
// clang-format on // clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType, using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType, BDataType,
...@@ -146,30 +152,37 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) ...@@ -146,30 +152,37 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
#if 1 //weight permute
int NPerBlock = 32; #if 0
int KPerBlock = 128;
int N1 = NPerBlock; int N1 = NPerBlock;
int K1 = KPerBlock; int K1 = KPerBlock;
int N0 = N / N1; int N0 = N / N1;
int K0 = K / K1; int K0 = K / K1;
int K01 = K0 / KBatch;
int K00 = KBatch;
std::cout << "K00 = " << K00 << " K01 = " << K01 << std::endl;
for(int i = 0; i < N0; i++) for(int k = 0; k < K00; k++)
{ {
for(int j = 0; j < K0; j++) for(int i = 0; i < N0; i++)
{ {
for(int ii = 0; ii < N1; ii++) for(int j = 0; j < K01; j++)
{ {
for(int jj = 0; jj < K1; jj++) for(int ii = 0; ii < N1; ii++)
{ {
b_k_n_permute(i * K0 * N1 * K1 + j * N1 * K1 + ii * K1 + jj) = for(int jj = 0; jj < K1; jj++)
b_k_n((i * N1 + ii) * K + (j * K1 + jj)); {
b_k_n_permute(k * N0 * K01 * N1 * K1 + i * K01 * N1 * K1 + j * N1 * K1 + ii * K1 + jj) =
b_k_n((i * N1 + ii) * K + (k * K01 * K1 + j * K1 + jj));
}
} }
} }
} }
} }
#else #else
for(int i = 0; i < N; i++) for(int i = 0; i < N; i++)
{ {
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
//#define WEIGHT_PERMUTE
namespace ck { namespace ck {
// Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same // Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same
...@@ -387,8 +389,8 @@ struct GridwiseGemm_xdl_cshuffle_v3 ...@@ -387,8 +389,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
} }
else else
{ {
// B Tile Permute // Weight Tile Permute
#if 0 #ifndef WEIGHT_PERMUTE
// not pad N or K // not pad N or K
const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
b_grid_desc_nraw_kraw, b_grid_desc_nraw_kraw,
...@@ -619,10 +621,10 @@ struct GridwiseGemm_xdl_cshuffle_v3 ...@@ -619,10 +621,10 @@ struct GridwiseGemm_xdl_cshuffle_v3
} }
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>) else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
{ {
#if 0 #ifndef WEIGHT_PERMUTE
b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize; b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize;
#else #else
const int k0_offset = karg.KRead * NPerBlock; const int k0_offset = karg.KRead * karg.N;
b_k_split_offset = blockIdx.z * k0_offset / BPackedSize; b_k_split_offset = blockIdx.z * k0_offset / BPackedSize;
#endif #endif
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment