import torch


def ggml_dequantize(
    weight: torch.Tensor, quant_type: int, M: int, N: int, dtype: torch.dtype
) -> torch.Tensor:
    """Dequantize a GGUF-quantized weight of logical shape (M, N) into a tensor of `dtype`."""
    assert M > 0 and N > 0, "GGUF weight input shape must be of positive dimensions"
    return torch.ops.sgl_kernel.ggml_dequantize.default(weight, quant_type, M, N, dtype)


def ggml_mul_mat_vec_a8(
    weight: torch.Tensor, x: torch.Tensor, quant_type: int, row: int
) -> torch.Tensor:
    """Matrix-vector product against a GGUF-quantized weight with `row` output rows."""
    return torch.ops.sgl_kernel.ggml_mul_mat_vec_a8.default(weight, x, quant_type, row)


def ggml_mul_mat_a8(
    weight: torch.Tensor, x: torch.Tensor, quant_type: int, row: int
) -> torch.Tensor:
    """Matrix-matrix product against a GGUF-quantized weight with `row` output rows."""
    return torch.ops.sgl_kernel.ggml_mul_mat_a8.default(weight, x, quant_type, row)


def ggml_moe_a8(
    input: torch.Tensor,
    weight: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    expert_ids: torch.Tensor,
    num_token_post_padded: torch.Tensor,
    type: int,
    row: int,
    topk: int,
    tokens: int,
) -> torch.Tensor:
    """Grouped MoE matmul over GGUF-quantized expert weights, using pre-sorted and padded token indices."""
    return torch.ops.sgl_kernel.ggml_moe_a8.default(
        input,
        weight,
        sorted_token_ids,
        expert_ids,
        num_token_post_padded,
        type,
        row,
        topk,
        tokens,
    )


def ggml_moe_a8_vec(
    input: torch.Tensor,
    weight: torch.Tensor,
    topk_ids: torch.Tensor,
    top_k: int,
    type: int,
    row: int,
    tokens: int,
) -> torch.Tensor:
    """MoE matrix-vector variant over GGUF-quantized expert weights, routed by `topk_ids`."""
    return torch.ops.sgl_kernel.ggml_moe_a8_vec.default(
        input, weight, topk_ids, top_k, type, row, tokens
    )


def ggml_moe_get_block_size(type: int) -> int:
    """Return the block size used by the MoE kernels for the given GGUF quantization type."""
    return torch.ops.sgl_kernel.ggml_moe_get_block_size.default(type)
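

# --- Usage sketch (illustration only, not part of the kernel bindings) ---
# A minimal sketch of how these wrappers are commonly wired into a GGUF linear
# layer. Assumptions: `qweight` holds the raw GGUF quantized blocks for an
# (out_features, in_features) weight loaded from a GGUF checkpoint, `x` is a
# CUDA half-precision activation of shape (tokens, in_features), and
# `quant_type` is the integer GGUF/ggml quantization-type id. The helper name
# `_example_gguf_linear` and the decode/prefill split below are illustrative
# assumptions, not definitions provided by this module.
def _example_gguf_linear(
    qweight: torch.Tensor,
    x: torch.Tensor,
    quant_type: int,
    out_features: int,
    in_features: int,
) -> torch.Tensor:
    if x.shape[0] == 1:
        # Single-token (decode) path: fused quantized matrix-vector kernel.
        return ggml_mul_mat_vec_a8(qweight, x, quant_type, out_features)
    # Multi-token fallback: dequantize the weight and use a dense matmul.
    weight = ggml_dequantize(qweight, quant_type, out_features, in_features, x.dtype)
    return x @ weight.T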