// With fp8/bf8 input (use_fp8) and bf16 output, an fp32 D_temp is needed because rocBLAS does not support this combination (fp8/bf8 input with fp16/fp32 output is supported).
D_type==rocblas_datatype_bf16_r)||
// When use_fp8 is true and the output is fp8/bf8, an fp32 D_temp is needed to support the amax and scale operations.
// With fp8/bf8 input (use_fp8) and bf16 output, an fp32 D_temp is needed because rocBLAS does not support this combination (fp8/bf8 input with fp16/fp32 output is supported).
B_type,ldb,&beta,D_temp,D_temp_type,ldd,D_temp,
// When use_fp8 is true and the output is fp8/bf8, an fp32 D_temp is needed to support the amax and scale operations.