Fix data type issue; now hitting a fast_numeric_converter call issue

fdfb2f61 · root · c2a77a07 · fdfb2f61 · fdfb2f61
Commit fdfb2f61 authored Sep 02, 2024 by root
2 changed files
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_b_scale.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_b_scale.cpp
@@ -80,7 +80,8 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_BScale_X
          S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
          S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
          1,    2,  S<1, 32, 1, 8>,  S<8, 8, 1>,
-          ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>;
+        //   ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>;
+          ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3>;
 // clang-format on
 int main(int argc, char* argv[])

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_b_scale.hpp
@@ -798,24 +798,15 @@ struct ThreadwiseTensorSliceTransfer_v3r1_b_scale
    private:
    static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){};
-    static constexpr auto src_oob_thread_scratch_desc_ =
-        decltype(GetSrcThreadScratchDescriptor()){};
    static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){};
    using SrcThreadScratch =
        StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
-                                        DstData, // apply data_convert with SrcThreadScratch
+                                        SrcData, // apply data_convert with SrcThreadScratch
                                        SrcScalarPerVector,
                                        decltype(src_thread_scratch_desc_),
                                        true>;
-    using SrcOOBThreadScratch =
-        StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
-                                        bool, // apply data_convert with SrcThreadScratch
-                                        1,
-                                        decltype(src_oob_thread_scratch_desc_),
-                                        true>;
    // Registers, contain fast converted data
    using SrcThreadConvertedScratch =
        StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
@@ -834,7 +825,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1_b_scale
        FastNumericArrayConverter<SrcData, DstData, SrcScalarPerVector>;
    StaticallyIndexedArray<SrcThreadScratch, NumThreadScratch> src_thread_scratch_tuple_;
-    StaticallyIndexedArray<SrcOOBThreadScratch, NumThreadScratch> src_oob_thread_scratch_tuple_;
    SrcThreadConvertedScratch src_converted_thread_scratch_;
    DstThreadScratch dst_thread_scratch_;