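Minor fix: zero the C device buffers with SetZero() in the grouped GEMM example and profiler, and read block_m_id / block_n_id / k_batch_id from block_work_idx instead of blockIdx in gridwise_gemm_xdlops_v2r4r2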

7277329e · Jing Zhang · root · c38d8fdc · 7277329e · 7277329e
Commit 7277329e, authored Apr 22, 2023 by Jing Zhang; committed by root on Apr 22, 2023
3 changed files
--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
@@ -147,6 +147,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
 #else
        a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
        b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());
+	c_tensors_device[i]->SetZero();
 #endif
        p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -533,9 +533,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
            return;
        }
-        const index_t block_m_id = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]);
-        const index_t block_n_id = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I2]);
-        const index_t k_batch_id = __builtin_amdgcn_readfirstlane(blockIdx.z);
+        const index_t k_batch_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]);
        // HACK: this force m/n_block_data_idx_on_grid into SGPR
        const index_t m_block_data_idx_on_grid =

--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -98,8 +98,6 @@ bool profile_grouped_gemm_impl(int do_verification,
            a_m_k[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
            b_k_n[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
        }
-        c_m_n_device_results[i].GenerateTensorValue(GeneratorTensor_0<CDataType>{}, num_thread);
    }
    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -134,13 +132,12 @@ bool profile_grouped_gemm_impl(int do_verification,
            std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize()));
        b_device_buf.emplace_back(
            std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize()));
        c_device_buf.emplace_back(std::make_unique<DeviceMem>(
            sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize()));
        a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
        b_device_buf[i]->ToDevice(b_k_n[i].mData.data());
-        c_device_buf[i]->ToDevice(c_m_n_device_results[i].mData.data());
+        c_device_buf[i]->SetZero();
        gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});