// these are not used and make no sense, but the compiler needs them
// these are not used and make no sense, but the compiler needs them
//template __global__ void gemm_device<float, 16, 128>(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc);
//template __global__ void gemm_device<float, 16, 128>(int M, int N, int K, float * __restrict__ const A, float* B, float * out, int lda, int ldb, int ldc);
...
@@ -3611,9 +3576,6 @@ template __global__ void gemm_device<half, 16, 96>(int M, int N, int K, half * _
...
@@ -3611,9 +3576,6 @@ template __global__ void gemm_device<half, 16, 96>(int M, int N, int K, half * _
//template __global__ void kMatmul_inference_4bit<NF4, half, half, half>(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB);