Unverified Commit c68f367e authored by Daniel Hiltgen's avatar Daniel Hiltgen Committed by GitHub
Browse files

Update GGML to b6646 (#12245)

Notable EOLs with this change:
- MacOS v12 and v13 are no longer supported (v14+ required)
- AMD gfx900 and gfx906 are no longer supported
parent fdb10946
#include "reduce_rows.cuh"
#include "sumrows.cuh" #include "sumrows.cuh"
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, 1, 1); const int id = ggml_cuda_get_device();
const int nsm = ggml_cuda_info().devices[id].nsm;
const dim3 block_nums(nrows, 1, 1); const dim3 block_nums(nrows, 1, 1);
reduce_rows_f32</*norm*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols); if ((nrows / nsm) < 2) {
const dim3 block_dims(512, 1, 1);
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
} else {
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
}
} }
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
...@@ -19,8 +27,17 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ...@@ -19,8 +27,17 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t ncols = src0->ne[0]; const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0); const int64_t nrows = ggml_nrows(src0);
const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(nrows, 1, 1); const dim3 block_nums(nrows, 1, 1);
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols); const int id = ggml_cuda_get_device();
const int nsm = ggml_cuda_info().devices[id].nsm;
if ((nrows / nsm) < 2) {
// Increase num threads to 512 for small nrows to better hide the latency
const dim3 block_dims(512, 1, 1);
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
} else {
// Enough active SMs to hide latency, use smaller blocks to allow better scheduling
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
}
} }
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment