// printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f\n", static_cast<int>(idx_1d.value), static_cast<float>());
// printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f \n", static_cast<int>(idx_1d.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<0>{})));
// Debug: print element 0 of the source vector for the current idx_1d.
// The first call passes the element to printf unconverted (format %hu); the
// second converts it to float and prefixes the message with blockIdx.x.
// The `template` keyword before AsType must stay a separate token: it is the
// disambiguator for a member template call, not part of the member's name.
// NOTE(review): %hu is only correct if SrcData is passed through the variadic
// call as an integer — confirm against the SrcData in use.
printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %hu \n",
       static_cast<int>(idx_1d.value),
       src_vector_container.template AsType<SrcData>().At(Number<0>{}));
printf("BlockId %d - Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f \n",
       static_cast<int>(blockIdx.x),
       static_cast<int>(idx_1d.value),
       static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<0>{})));
// printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %hu \n", static_cast<int>(idx_1d.value), src_vector_container.template AsType<SrcData>().At(Number<0>{}));
// printf("Threadwise_tensor slice v6r1r2 line 121 : Element-wise Operation Result at idx %d: %f\n", static_cast<int>(i.value), static_cast<float>(v));
// }
// Emin @added
// Block-wide barrier placed between the debug prints; presumably added while
// debugging rather than required by the algorithm — TODO confirm.
// NOTE(review): __syncthreads() must be reached by every thread of the block.
// The enclosing control flow is not visible from here; verify this call is not
// inside a branch that only some threads take.
__syncthreads();
// Debug: Print SrcData before and after applying element-wise operation
if(threadIdx.x==0&&threadIdx.y==0){
// Debug trace for element i of the vector at idx_1d, printed in three stages.
// Each stage is printed twice: once passing the value to printf unconverted
// (format %hu) and once converted to float with a "BlockId" prefix.
// The `template` keyword before AsType must stay a separate token: it is the
// disambiguator for a member template call, not part of the member's name.
// NOTE(review): the %hu variants are only correct if the printed value is
// passed through the variadic call as an integer — confirm for SrcData,
// DstData and the type of v.

// Stage 1: source element, labelled "before element-wise op" (line 127).
printf("Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d , i %d: %hu \n",
       static_cast<int>(idx_1d.value),
       static_cast<int>(i.value),
       src_vector_container.template AsType<SrcData>().At(Number<i>{}));
printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d , i %d: %f \n",
       static_cast<int>(blockIdx.x),
       static_cast<int>(idx_1d.value),
       static_cast<int>(i.value),
       static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<i>{})));

// Stage 2: the value v, labelled "after element-wise op" (line 129).
printf("Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %hu\n",
       static_cast<int>(idx_1d.value),
       static_cast<int>(i.value),
       v);
printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %f\n",
       static_cast<int>(blockIdx.x),
       static_cast<int>(idx_1d.value),
       static_cast<int>(i.value),
       static_cast<float>(v));

// Stage 3: destination element, labelled "after type conversion" (line 140).
printf("Threadwise_tensor_slice_v6r1r2 line 140 : DstData after type conversion at idx %d, i %d: %hu \n",
       static_cast<int>(idx_1d.value),
       static_cast<int>(i.value),
       dst_vector_container.template AsType<DstData>().At(Number<i>{}));
printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 140 : DstData after type conversion at idx %d, i %d: %f \n",
       static_cast<int>(blockIdx.x),
       static_cast<int>(idx_1d.value),
       static_cast<int>(i.value),
       static_cast<float>(dst_vector_container.template AsType<DstData>().At(Number<i>{})));