#ifndef xpu_ops_H #define xpu_ops_H #include #include #include #include #include #include #include template static inline void sycl_kernel_submit(sycl::nd_range range, sycl::queue q, ker_t ker) { auto cgf = [&](::sycl::handler& cgh) [[sycl::reqd_sub_group_size(subgroup_size)]] { cgh.parallel_for(range, ker); }; q.submit(cgf); } template static inline void sycl_comp_kernel_submit(sycl::nd_range range, sycl::queue q, ker_t ker) { auto cgf = [&](::sycl::handler& cgh) [[sycl::reqd_sub_group_size(subgroup_size)]] { ker.sycl_ker_local_memory_creation(cgh); cgh.parallel_for(range, ker); }; q.submit(cgf); } typedef enum DataType_t { General8bit = 0, FP4 = 1, NF4 = 2, } DataType_t; template void dequantizeBlockwise( float* code, unsigned char* A, float* absmax, T* out, int workgroup_size, const int n, sycl::queue* stream ); template void gemv_4bit_inference( int m, int n, int k, T* A, unsigned char* B, float* absmax, float* datatype, T* out, int lda, int ldb, int ldc, int blocksize, sycl::queue* stream ); #endif