Temp save

32bac6f3 · aska-0096 · 66e61076 · 32bac6f3 · 32bac6f3 · 32bac6f3
Commit 32bac6f3 authored Aug 01, 2023 by aska-0096
3 changed files
--- a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp
@@ -419,7 +419,7 @@ struct Blockwise_fpAintB_GemmWMMA
                        // convert B from int8 to fp16, multiply scale
                        static_for<0, b_thread_buf.Size(), 1>{}([&](auto i) {
                            converted_b_thread_buf(i) = scale_thread_buf[i / WmmaK] *
-                                                        type_convert<ADataType>(b_thread_buf[i]);
+                                                        type_convert<ADataType>(b_thread_buf[i]); // NOTE(review): presumably this int8 -> fp16 conversion should call byte permute; confirm
                        });
                        // read A
                        a_thread_copy_.Run(

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -1143,7 +1143,9 @@ struct ThreadwiseTensorSliceTransfer_v4
            const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                src_desc, src_data_coord);
+#if 0
+            printf("Tid: %03d, LDS read offset: %d\n", get_thread_local_1d_id(), src_data_coord.GetOffset());
+#endif 
            // copy data from src_buf into src_tmp_vector
            if constexpr(SrcBuffer::IsDynamicBuffer())
            {
@@ -1417,10 +1419,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
                                                    1,
                                                    0);
                v_theother_row = type_convert_sp<SrcData>(temp);
-                // if (get_thread_local_1d_id() == 0){
-                //                 printf("src_offset:%d, dst_offset for this row: %d, dst_offset
-                //                 for the other row: %d \n",
-                //                         src_offset, dst_offset, dst_offset+DstScalarPerVector);}
                if(get_thread_local_1d_id() % 32 < 16)
                {
                    // apply type convert

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -207,7 +207,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
            // copy data from src_buf into src_vector_container
            auto src_vector_container = src_vector_type{
                src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid)};
+            if (false){
+                printf("Tid: %03d, a_grid_buf: %04x\n",
+                        get_thread_local_1d_id(),
+                        *(reinterpret_cast<const uint16_t*>(&src_vector_container.template AsType<SrcData>()[Number<0>{}])));
+            }
            // copy data from src_vector_container into src_thread_scratch_
            src_thread_scratch_tuple_(thread_scratch_id)
                .template SetAsType<src_vector_t>(
@@ -442,7 +446,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1
            const bool is_dst_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
+#if 0
+            printf("Tid: %03d, LDS write offset: %d\n", get_thread_local_1d_id(), dst_coord_.GetOffset());
+#endif
            using dst_vector_type = vector_type_maker_t<DstData, DstScalarPerVector>;
            using dst_vector_t    = typename dst_vector_type::type;