debugging

43a20997 · aska-0096 · 0cd587d9 · 43a20997 · 43a20997 · 43a20997
Commit 43a20997 authored Dec 02, 2022 by aska-0096
6 changed files
--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
@@ -32,10 +32,8 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
    {
    case 0: break;
    case 1:
-        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k.begin(),
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{1.f, 1.f}(a_m_k.begin(), a_m_k.end());
-                                                                             a_m_k.end());
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{1.f, 1.f}(b_k_n.begin(), b_k_n.end());
-        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n.begin(),
-                                                                             b_k_n.end());
        break;
    default:
        ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k.begin(), a_m_k.end());
@@ -102,7 +100,6 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
        return true;
    }
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
    std::size_t flop = 2_uz * M * N * K;

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
@@ -226,6 +226,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3
                    constexpr index_t c_offset =
                        c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0));
+                    // debug_hexprinter(0x3c003c00, a_thread_vec.template AsType<FloatAB>()(Number<0>{}));
                    wmma_gemm.template Run(
                            a_thread_vec.template AsType<wmma_input_type>()(Number<0>{}),
                            b_thread_vec.template AsType<wmma_input_type>()(Number<0>{}),

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
@@ -359,8 +359,6 @@ struct DeviceGemmWmma : public DeviceGemm<ALayout,
                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
                    true>; // Last Option is W/O 
-                std::cout<<"Host kernel type is "<< type_name<decltype(kernel)>()<<std::endl;
-                printf("---------------------Crush before kernel launch-------------------\n");
                ave_time = launch_and_time_kernel(stream_config,
                                                  kernel,
                                                  dim3(grid_size),

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
@@ -356,6 +356,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
 /*******************************************************************************/
 // BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy
        const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
+        printf("A_GRID_DESC: %s \n", std::string(type_name<decltype(a_grid_desc_k0_m_k1)>()).c_str());
        constexpr auto max_lds_align = K1;
        constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1();
        constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();
@@ -457,7 +458,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
        // gridwise GEMM pipeline
        const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock);
        GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_k0_m_k1,
                                                          a_block_desc_k0perblock_mperblock_k1,
                                                          a_blockwise_copy,

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -208,6 +208,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};
            // apply SrcElementwiseOperation on src_vector_container
+            debug_hexprinter(0xffffffff, src_coord_.GetOffset());
            static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                SrcData src_v;

--- a/include/ck/utility/common_header.hpp
+++ b/include/ck/utility/common_header.hpp
@@ -72,3 +72,11 @@ constexpr auto type_name() {
  name.remove_suffix(suffix.size());
  return name;
 }
+template <typename T>
+__device__
+void debug_hexprinter(const uint32_t v_target, T v_val){
+    const uint32_t v_dbg = *(reinterpret_cast<uint32_t*>(&v_val));
+    if(v_dbg != v_target)
+        printf("@Thread: %d, Val: %08x != Target: %08x\n", ck::get_thread_local_1d_id(), v_dbg, v_target);
+}