Unverified commit b9809638 authored by Michael Yang, committed by GitHub

Merge pull request #255 from jmorganca/update-llama-cpp

Update llama cpp
parents da52f5bf 74a5f7e6
/**
* llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
* Copyright (c) 2023 Georgi Gerganov
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
#ifdef __cplusplus
}
#endif
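A hedged sketch of how this allocator API is meant to be driven, mirroring the measure-then-allocate pattern used further down in this diff in llama.cpp's context setup (build_graph() and the buffer here are illustrative placeholders, not part of ggml):

#include <vector>
#include <cstdint>
#include "ggml.h"
#include "ggml-alloc.h"

// Illustrative only: build_graph() stands in for whatever constructs the ggml_cgraph.
extern struct ggml_cgraph * build_graph(struct ggml_context * ctx);

void allocator_example(struct ggml_context * ctx) {
    static const size_t tensor_alignment = 32;

    // 1) measure: the measuring allocator records peak usage without touching real memory
    struct ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
    size_t alloc_size = ggml_allocr_alloc_graph(alloc, build_graph(ctx)) + tensor_alignment;
    ggml_allocr_free(alloc);

    // 2) recreate the allocator over a real buffer of exactly the measured size
    std::vector<uint8_t> buf(alloc_size);
    alloc = ggml_allocr_new(buf.data(), buf.size(), tensor_alignment);

    // 3) per evaluation: reset, rebuild the graph, then let the allocator place tensor data
    ggml_allocr_reset(alloc);
    ggml_allocr_alloc_graph(alloc, build_graph(ctx));

    ggml_allocr_free(alloc);
}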
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
@@ -53,6 +53,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
+void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
...
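For context, this new entry point is wired up later in this diff from llama_model_load_internal, driven by the new mul_mat_q context flag (described in llama.h below as enabling the experimental mul_mat_q kernels). Roughly how it is called there:

#if defined(GGML_USE_CUBLAS)
    ggml_cuda_set_main_device(main_gpu);
    ggml_cuda_set_mul_mat_q(mul_mat_q);  // opt in/out of the experimental quantized matmul kernels
#endif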
//go:build darwin
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
...
//go:build darwin
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
@@ -746,7 +746,8 @@ void ggml_metal_graph_compute(
 // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
 GGML_ASSERT(ne00 == ne10);
-GGML_ASSERT(ne02 == ne12);
+// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+GGML_ASSERT(ne03 == ne13);
 if (ggml_is_contiguous(src0) &&
 ggml_is_contiguous(src1) &&
@@ -774,11 +775,11 @@ void ggml_metal_graph_compute(
 initWithDevice:ctx->device transposeLeft:false transposeRight:true
 resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
-// we need to do ne02 multiplications
+// we need to do ne12 multiplications
 // TODO: is there a way to do this in parallel - currently very slow ..
 // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-for (int64_t i02 = 0; i02 < ne02; ++i02) {
-size_t offs_src0_cur = offs_src0 + i02*nb02;
+for (int64_t i02 = 0; i02 < ne12; ++i02) {
+size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
 size_t offs_src1_cur = offs_src1 + i02*nb12;
 size_t offs_dst_cur = offs_dst + i02*nb2;
@@ -800,8 +801,6 @@ void ggml_metal_graph_compute(
 switch (src0t) {
 case GGML_TYPE_F16:
 {
-GGML_ASSERT(ne02 == ne12);
 nth0 = 64;
 nth1 = 1;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -881,16 +880,18 @@ void ggml_metal_graph_compute(
 [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
 [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
 [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
-[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
-[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
-[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
-[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
-[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
-[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
-[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
-[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
-[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
+[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
+[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
 if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
 src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
...
//go:build darwin
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
@@ -537,11 +537,13 @@ kernel void kernel_mul_mat_f16_f32(
 device float * dst,
 constant int64_t & ne00,
 constant int64_t & ne01,
+constant int64_t & ne02,
 constant uint64_t & nb00,
 constant uint64_t & nb01,
 constant uint64_t & nb02,
 constant int64_t & ne10,
 constant int64_t & ne11,
+constant int64_t & ne12,
 constant uint64_t & nb10,
 constant uint64_t & nb11,
 constant uint64_t & nb12,
@@ -557,7 +559,7 @@ kernel void kernel_mul_mat_f16_f32(
 const int64_t r1 = tgpig.y;
 const int64_t im = tgpig.z;
-device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
+device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
 device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 sum[tpitg.x] = 0.0f;
@@ -580,6 +582,7 @@ kernel void kernel_mul_mat_f16_f32(
 }
 }
 kernel void kernel_alibi_f32(
 device const float * src0,
 device float * dst,
...
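Both the Objective-C dispatch loop and the Metal kernel above use the same broadcast indexing, im/(ne12/ne02): src1 carries ne12 matrices while src0 carries only ne02, with ne02 dividing ne12 (the grouped-query-attention case, where several query heads share one K/V head). A hedged scalar sketch of the index mapping, with illustrative sizes:

// Illustrative only: how a batch index into src1 maps back to a (smaller) batch in src0.
const int64_t ne02 = 8;   // e.g. number of K/V heads   (src0 batches), assumption for the example
const int64_t ne12 = 32;  // e.g. number of query heads (src1 batches), ne12 % ne02 == 0

for (int64_t i12 = 0; i12 < ne12; ++i12) {
    const int64_t i02 = i12 / (ne12 / ne02);  // each src0 matrix is reused for ne12/ne02 consecutive i12
    // dst[i12] = src0[i02] * src1[i12]
}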
//go:build mpi
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
...
//go:build mpi
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
...
//go:build opencl
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
...
//go:build opencl
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
...
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
@@ -4585,8 +4585,10 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 struct ggml_context * ctx,
 enum ggml_type type,
 int n_dims,
-const int64_t* ne,
-void* data) {
+const int64_t * ne,
+void * data) {
+assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);

 size_t data_size = 0;

@@ -6264,6 +6266,27 @@ struct ggml_tensor * ggml_reshape_4d(

 // ggml_view_1d

+static struct ggml_tensor * ggml_view_tensor_offset(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int n_dims,
+const int64_t * ne,
+size_t offset) {
+// don't calculate an offset from an unallocated tensor
+void * data = NULL;
+if (a->data != NULL) {
+data = (char *) a->data + offset;
+}
+struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
+ggml_format_name(result, "%s (view)", a->name);
+ggml_set_op_params(result, &offset, sizeof(offset));
+return result;
+}
 struct ggml_tensor * ggml_view_1d(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
@@ -6276,10 +6299,7 @@ struct ggml_tensor * ggml_view_1d(
 is_node = true;
 }

-struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
-ggml_format_name(result, "%s (view)", a->name);
-ggml_set_op_params(result, &offset, sizeof(offset));
+struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);

 result->op = GGML_OP_VIEW;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6306,10 +6326,7 @@ struct ggml_tensor * ggml_view_2d(

 const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };

-struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
-ggml_format_name(result, "%s (view)", a->name);
-ggml_set_op_params(result, &offset, sizeof(offset));
+struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);

 result->nb[1] = nb1;
 result->nb[2] = result->nb[1]*ne1;
@@ -6342,10 +6359,7 @@ struct ggml_tensor * ggml_view_3d(

 const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };

-struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
-ggml_format_name(result, "%s (view)", a->name);
-ggml_set_op_params(result, &offset, sizeof(offset));
+struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);

 result->nb[1] = nb1;
 result->nb[2] = nb2;
@@ -6380,10 +6394,7 @@ struct ggml_tensor * ggml_view_4d(

 const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };

-struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
-ggml_format_name(result, "%s (view)", a->name);
-ggml_set_op_params(result, &offset, sizeof(offset));
+struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);

 result->nb[1] = nb1;
 result->nb[2] = nb2;
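The NULL check in ggml_view_tensor_offset above ties into the allocator work elsewhere in this diff: when a graph is built in a context created with no_alloc = true (as llama.cpp now does under LLAMA_USE_ALLOCATOR), tensor data pointers stay NULL until ggml_allocr_alloc_graph assigns them, so a view must not compute data + offset eagerly. A hedged sketch with illustrative sizes:

// Illustrative only: creating a view before any tensor data has been allocated.
struct ggml_init_params params = {
    /*.mem_size   =*/ 16*1024*1024,  // illustrative: holds tensor/graph metadata only
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ true,          // do not allocate tensor data in this context
};
struct ggml_context * ctx0 = ggml_init(params);

struct ggml_tensor * kv = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1024); // kv->data == NULL here
struct ggml_tensor * k  = ggml_view_1d(ctx0, kv, 512, 0);                // safe: offset applied only if data != NULL
// ... later, ggml_allocr_alloc_graph() assigns real addresses to these tensors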
@@ -6767,6 +6778,18 @@ struct ggml_tensor * ggml_rope_inplace(
 return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
 }

+struct ggml_tensor * ggml_rope_custom(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int n_past,
+int n_dims,
+int mode,
+int n_ctx,
+float freq_base,
+float freq_scale) {
+return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+}
 struct ggml_tensor * ggml_rope_custom_inplace(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
...
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
@@ -1196,7 +1196,18 @@ extern "C" {
 int mode,
 int n_ctx);
-// custom RoPE, in-place, returns view(a)
+// custom RoPE
+GGML_API struct ggml_tensor * ggml_rope_custom(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+int n_past,
+int n_dims,
+int mode,
+int n_ctx,
+float freq_base,
+float freq_scale);
+
+// in-place, returns view(a)
 GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
...
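ggml_rope_custom is the out-of-place counterpart of the existing ggml_rope_custom_inplace, exposing freq_base and freq_scale directly. A hedged usage sketch (tensor and dimension names are illustrative; a freq_scale below 1.0 is the usual linear RoPE-scaling trick for running past the trained context length):

// Illustrative only: apply RoPE with a custom base/scale to a query tensor Qcur.
struct ggml_tensor * Qrot = ggml_rope_custom(ctx0, Qcur, n_past, n_rot, /*mode =*/ 0, n_ctx,
                                             /*freq_base  =*/ 10000.0f,
                                             /*freq_scale =*/ 0.5f);   // ~2x context via linear scaling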
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
@@ -65,6 +65,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
 //
 // 2-6 bit quantization in super-blocks
 //
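The new MM256_SET_M128I macro composes a 256-bit vector from two 128-bit halves via cast + insertf128, presumably as a stand-in for _mm256_set_m128i on compilers that do not provide that intrinsic; the remaining hunks in this file are the mechanical substitution. A small sketch of the equivalence:

#include <immintrin.h>

// hi ends up in the upper 128-bit lane, lo in the lower lane -- the same lane
// order as _mm256_set_m128i(hi, lo) where that intrinsic is available.
static inline __m256i set_m128i_pair(__m128i hi, __m128i lo) {
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}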
@@ -1379,7 +1381,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
 const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
 const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 __m256i sumi = _mm256_setzero_si256();
@@ -1447,7 +1449,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
 // sumf += -dmin * summs in 32bits*8
-acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
+acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
 const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
 const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@@ -1519,7 +1521,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 }
 // sumf += dall * isum - dmin * summs in 32bits
-__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
 }
@@ -1670,8 +1672,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 summs += dmin * smin;
 const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
-const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
+const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
 const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
 const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@@ -1735,10 +1737,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
 const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
-const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
-const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
-const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
-const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
@@ -1943,7 +1945,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
 const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
 const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 // high bit
 const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@@ -2154,7 +2156,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 }
 // multiply with block scale and accumulate
-__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
 }
@@ -2329,13 +2331,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 aux16[0] = a & 0x0f0f;
 aux16[1] = (a >> 4) & 0x0f0f;
-const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
-const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
+const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
 memcpy(&aux64, x[i].hmask, 8);
 const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-__m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
+__m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
 __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
 q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
 q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
@@ -2344,7 +2346,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
 // prepare low and high bits
-const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
+const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
 const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
 const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
@@ -2455,7 +2457,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 p16_0 = _mm_add_epi32(p16_0, p16_2);
 p16_1 = _mm_add_epi32(p16_1, p16_3);
-__m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+__m256i p16 = MM256_SET_M128I(p16_1, p16_0);
 // multiply with block scale and accumulate
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
@@ -2646,7 +2648,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
 const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-const __m256i scales = _mm256_set_m128i(sc128, sc128);
+const __m256i scales = MM256_SET_M128I(sc128, sc128);
 __m256i sumi = _mm256_setzero_si256();
@@ -2753,7 +2755,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 }
 __m256 vd = _mm256_set1_ps(d);
-__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 }
@@ -2994,11 +2996,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
 const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
-acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);
 const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
 const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
-acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
 }
@@ -3186,7 +3188,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 summs += dmin * _mm_extract_epi32(hsum, 0);
 const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-const __m256i scales = _mm256_set_m128i(sc128, sc128);
+const __m256i scales = MM256_SET_M128I(sc128, sc128);
 const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
 __m256i hmask = mone;
@@ -3325,7 +3327,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 }
 __m256 vd = _mm256_set1_ps(d);
-__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 }
@@ -3488,13 +3490,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
-const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
-const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
+const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
 int64_t aux64;
 memcpy(&aux64, x[i].qh, 8);
 const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
-const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
+const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);
 const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
 const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@@ -3569,7 +3571,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
 const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
-acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
 }
@@ -3951,7 +3953,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 }
-__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
 }
@@ -4109,8 +4111,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
 const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
-const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
-const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
+const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
 const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
 const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@@ -4203,7 +4205,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
 sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
-acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
 }
 *s = hsum_float_8(acc);
...
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
...
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
...
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
@@ -82,8 +82,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
+#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#include "ggml-alloc.h"
+#define LLAMA_USE_ALLOCATOR
+#else
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
+#endif
 // available llama models
 enum e_model {
@@ -353,13 +359,22 @@ struct llama_model {
 struct llama_context {
 llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-#ifdef GGML_USE_METAL
 ~llama_context() {
+if (model_owner) {
+delete &model;
+}
+#ifdef GGML_USE_METAL
 if (ctx_metal) {
 ggml_metal_free(ctx_metal);
 }
+#endif
+#ifdef LLAMA_USE_ALLOCATOR
+if (alloc) {
+ggml_allocr_free(alloc);
+}
 }
 #endif
+}
 std::mt19937 rng;
 bool has_evaluated_once = false;
@@ -397,7 +412,17 @@ struct llama_context {
 // memory buffers used to evaluate the model
 // TODO: move in llama_state
 llama_ctx_buffer buf_compute;
+#ifdef LLAMA_USE_ALLOCATOR
+llama_ctx_buffer buf_alloc;
+ggml_allocr * alloc = NULL;
+#endif
+#ifdef LLAMA_USE_SCRATCH
 llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+int buf_last = 0;
+size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
+#endif
 #ifdef GGML_USE_METAL
 ggml_metal_context * ctx_metal = NULL;
@@ -407,9 +432,6 @@ struct llama_context {
 ggml_mpi_context * ctx_mpi = NULL;
 #endif
-int buf_last = 0;
-size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
 void use_buf(struct ggml_context * ctx, int i) {
 #if defined(LLAMA_USE_SCRATCH)
 size_t last_size = 0;
@@ -905,6 +927,7 @@ struct llama_context_params llama_context_default_params() {
 /*.progress_callback =*/ nullptr,
 /*.progress_callback_user_data =*/ nullptr,
 /*.low_vram =*/ false,
+/*.mul_mat_q =*/ false,
 /*.f16_kv =*/ true,
 /*.logits_all =*/ false,
 /*.vocab_only =*/ false,
@@ -1032,6 +1055,7 @@ static void llama_model_load_internal(
 int n_gpu_layers,
 int main_gpu,
 const float * tensor_split,
+const bool mul_mat_q,
 float rope_freq_base,
 float rope_freq_scale,
 bool low_vram,
@@ -1160,9 +1184,11 @@ static void llama_model_load_internal(
 }
 (void) main_gpu;
+(void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
 fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
 ggml_cuda_set_main_device(main_gpu);
+ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -1256,12 +1282,16 @@ static void llama_model_load_internal(
 const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 // this is the total memory required to run the inference
-const size_t mem_required =
+size_t mem_required =
 ctx_size +
-mmapped_size - vram_weights + // weights in VRAM not in memory
+mmapped_size - vram_weights; // weights in VRAM not in memory
+#ifndef LLAMA_USE_ALLOCATOR
+mem_required +=
 MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
 MEM_REQ_SCRATCH1().at(model.type) +
 MEM_REQ_EVAL().at(model.type);
+#endif
 // this is the memory required by one llama_state
 const size_t mem_required_state =
@@ -1367,6 +1397,7 @@ static bool llama_model_load(
 int n_gpu_layers,
 int main_gpu,
 const float * tensor_split,
+const bool mul_mat_q,
 float rope_freq_base,
 float rope_freq_scale,
 bool low_vram,
@@ -1377,7 +1408,8 @@ static bool llama_model_load(
 llama_progress_callback progress_callback,
 void *progress_callback_user_data) {
 try {
-llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
+main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
 return true;
 } catch (const std::exception & err) {
@@ -1386,32 +1418,15 @@ static bool llama_model_load(
 }
 }
-// evaluate the transformer
-//
-// - lctx: llama context
-// - tokens: new batch of tokens to process
-// - embd embeddings input
-// - n_tokens number of tokens
-// - n_past: the context size so far
-// - n_threads: number of threads to use
-//
-static bool llama_eval_internal(
+static struct ggml_cgraph * llama_build_graph(
 llama_context & lctx,
 const llama_token * tokens,
 const float * embd,
 int n_tokens,
-int n_past,
-int n_threads,
-const char * cgraph_fname) {
+int n_past) {
 LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
-#ifdef GGML_USE_MPI
-ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-const int64_t t_start_us = ggml_time_us();
 const int N = n_tokens;
 const auto & model = lctx.model;
@@ -1427,10 +1442,8 @@ static bool llama_eval_internal(
 const int64_t n_head = hparams.n_head;
 const int64_t n_head_kv = hparams.n_head_kv;
 const int64_t n_embd_head = hparams.n_embd_head();
-const int64_t n_vocab = hparams.n_vocab;
 const int64_t n_embd_gqa = hparams.n_embd_gqa();
 LLAMA_ASSERT(n_embd_head == hparams.n_rot);
 const float freq_base = hparams.rope_freq_base;
@@ -1442,26 +1455,35 @@ static bool llama_eval_internal(
 auto & mem_per_token = lctx.mem_per_token;
 auto & buf_compute = lctx.buf_compute;
 struct ggml_init_params params = {
 /*.mem_size =*/ buf_compute.size,
 /*.mem_buffer =*/ buf_compute.addr,
 /*.no_alloc =*/ false,
 };
+#ifdef LLAMA_USE_ALLOCATOR
+params.no_alloc = true;
+#endif
 struct ggml_context * ctx0 = ggml_init(params);
 ggml_cgraph * gf = ggml_new_graph(ctx0);
-// for big prompts, if BLAS is enabled, it is better to use only one thread
-// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
 if (tokens) {
 struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+#ifdef LLAMA_USE_ALLOCATOR
+ggml_allocr_alloc(lctx.alloc, inp_tokens);
+if (!ggml_allocr_is_measure(lctx.alloc)) {
 memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+}
+#else
+memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+#endif
 ggml_set_name(inp_tokens, "inp_tokens");
 inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
@@ -1471,8 +1493,16 @@ static bool llama_eval_internal(
 #endif
 inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+#ifdef LLAMA_USE_ALLOCATOR
+ggml_allocr_alloc(lctx.alloc, inpL);
+if (!ggml_allocr_is_measure(lctx.alloc)) {
 memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
 }
+#else
+memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+#endif
+}
 const int i_gpu_start = n_layer - n_gpu_layers;
 (void) i_gpu_start;
@@ -1498,6 +1528,17 @@ static bool llama_eval_internal(
 }
 #endif // GGML_USE_CUBLAS
+struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+#ifdef LLAMA_USE_ALLOCATOR
+ggml_allocr_alloc(lctx.alloc, KQ_scale);
+if (!ggml_allocr_is_measure(lctx.alloc)) {
+ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+}
+#else
+ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+#endif
+ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 for (int il = 0; il < n_layer; ++il) {
 ggml_format_name(inpL, "layer_inp_%d", il);
@@ -1593,9 +1634,6 @@ static bool llama_eval_internal(
 ggml_set_name(KQ, "KQ");
 // KQ_scaled = KQ / sqrt(n_embd_head)
-struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
-ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 // KQ_scaled shape [n_past + N, N, n_head, 1]
 struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
 offload_func_kq(KQ_scaled);
@@ -1711,9 +1749,6 @@ static bool llama_eval_internal(
 lctx.use_buf(ctx0, 0);
-// used at the end to optionally extract the embeddings
-struct ggml_tensor * embeddings = NULL;
 // norm
 {
 cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
@@ -1724,8 +1759,6 @@ static bool llama_eval_internal(
 cur = ggml_mul(ctx0, cur, model.norm);
 // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
 ggml_set_name(cur, "result_norm");
-embeddings = cur;
 }
 // lm_head
@@ -1737,12 +1770,88 @@ static bool llama_eval_internal(
 // logits -> probs
 //cur = ggml_soft_max_inplace(ctx0, cur);
-// run the computation
 ggml_build_forward_expand(gf, cur);
-// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
+if (mem_per_token == 0) {
+mem_per_token = ggml_used_mem(ctx0)/N;
+}
+#if 0
+printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+ggml_used_mem(ctx0)/1024.0/1024.0,
+lctx.get_buf_max_mem(0)/1024.0/1024.0,
+lctx.get_buf_max_mem(1)/1024.0/1024.0,
+lctx.work_buffer.size()/1024.0/1024.0,
+n_past, N);
+#endif
+ggml_free(ctx0);
+return gf;
+}
+// evaluate the transformer
+//
+// - lctx: llama context
+// - tokens: new batch of tokens to process
+// - embd embeddings input
+// - n_tokens number of tokens
+// - n_past: the context size so far
+// - n_threads: number of threads to use
+//
+static bool llama_eval_internal(
+llama_context & lctx,
+const llama_token * tokens,
+const float * embd,
+int n_tokens,
+int n_past,
+int n_threads,
+const char * cgraph_fname) {
+LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+const int64_t t_start_us = ggml_time_us();
+#ifdef GGML_USE_MPI
+ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif
+const int N = n_tokens;
+const auto & model = lctx.model;
+const auto & hparams = model.hparams;
+const auto & kv_self = lctx.kv_self;
+LLAMA_ASSERT(!!kv_self.ctx);
+const int64_t n_embd = hparams.n_embd;
+const int64_t n_vocab = hparams.n_vocab;
+#ifdef LLAMA_USE_ALLOCATOR
+ggml_allocr_reset(lctx.alloc);
+#endif
+ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
+#ifdef LLAMA_USE_ALLOCATOR
+ggml_allocr_alloc_graph(lctx.alloc, gf);
+#endif
+// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+// for big prompts, if BLAS is enabled, it is better to use only one thread
+// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
+LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
 #if GGML_USE_MPI
+const int64_t n_layer = hparams.n_layer;
 ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
@@ -1754,7 +1863,10 @@ static bool llama_eval_internal(
 //}
 ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
 ggml_metal_graph_compute(lctx.ctx_metal, gf);
-ggml_metal_get_tensor (lctx.ctx_metal, cur);
+ggml_metal_get_tensor (lctx.ctx_metal, res);
+if (!lctx.embedding.empty()) {
+ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
+}
 } else {
 // IMPORTANT:
 // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1785,8 +1897,6 @@ static bool llama_eval_internal(
 // update kv token count
 lctx.kv_self.n = n_past + N;
-struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
 if (cgraph_fname) {
 ggml_graph_export(gf, cgraph_fname);
 }
@@ -1824,21 +1934,6 @@ static bool llama_eval_internal(
 memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
 }
-if (mem_per_token == 0) {
-mem_per_token = ggml_used_mem(ctx0)/N;
-}
-#if 0
-printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
-ggml_used_mem(ctx0)/1024.0/1024.0,
-lctx.get_buf_max_mem(0)/1024.0/1024.0,
-lctx.get_buf_max_mem(1)/1024.0/1024.0,
-lctx.work_buffer.size()/1024.0/1024.0,
-n_past, N);
-#endif
-ggml_free(ctx0);
 // measure the performance only for the single-token evals
 if (N == 1) {
 lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -1950,7 +2045,9 @@ struct llama_tokenizer {
 if (token == vocab_.token_to_id.end()) {
 // output any symbols that did not form tokens as bytes.
 for (int j = 0; j < (int) symbol.n; ++j) {
-llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+// NOTE: old version, before #2420 - not sure what are the implications of this
+//llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
 output.push_back(token_id);
 }
 } else {
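The byte-fallback change above replaces a hard-coded id offset with a vocabulary lookup. A hedged illustration of the two behaviours (the old + 3 assumed the 256 byte tokens sit immediately after the three special tokens in the SentencePiece vocab; the new code asks the vocabulary itself and will throw if a byte token is missing):

// Illustrative only: the two byte-fallback strategies side by side.
static llama_vocab::id byte_token_old(char c) {
    return static_cast<uint8_t>(c) + 3;              // assumes <unk>, <s>, </s>, then the byte tokens
}
static llama_vocab::id byte_token_new(const llama_vocab & vocab, char c) {
    return vocab.token_to_id.at(std::string(1, c));  // throws std::out_of_range if the byte token is absent
}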
@@ -3127,7 +3224,7 @@ struct llama_model * llama_load_model_from_file(
 ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
-params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
 params.progress_callback_user_data)) {
 delete model;
@@ -3204,10 +3301,47 @@ struct llama_context * llama_new_context_with_model(
 ctx->embedding.resize(hparams.n_embd);
 }
+#ifdef LLAMA_USE_ALLOCATOR
+{
+static const size_t tensor_alignment = 32;
+// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
+ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+// create measure allocator
+ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
+// build worst-case graph
+int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
+int n_past = hparams.n_ctx - n_tokens;
+llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
+// measure memory requirements for the graph
+size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
+fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+// debug - for comparison with scratch buffer
+//size_t prev_req =
+// MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
+// MEM_REQ_SCRATCH1().at(ctx->model.type) +
+// MEM_REQ_EVAL().at(ctx->model.type);
+//fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+// recreate allocator with exact memory requirements
+ggml_allocr_free(ctx->alloc);
+ctx->buf_alloc.resize(alloc_size);
+ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+}
+#else
 ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+#endif
+#ifdef LLAMA_USE_SCRATCH
 ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
 ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
+#endif
 }
 #ifdef GGML_USE_METAL
@@ -3277,9 +3411,6 @@ struct llama_context * llama_init_from_file(
 }
 void llama_free(struct llama_context * ctx) {
-if (ctx->model_owner) {
-delete &ctx->model;
-}
 delete ctx;
 }
...
@@ -128,6 +128,11 @@ func New(model string, opts api.Options) (*LLM, error) {
 C.llama_backend_init(C.bool(llm.UseNUMA))
+// TODO: GQA == 8 suggests 70B model which doesn't support metal
+if llm.NumGQA == 8 {
+llm.NumGPU = 0
+}
 params := C.llama_context_default_params()
 params.seed = C.uint(llm.Seed)
 params.n_ctx = C.int(llm.NumCtx)
...
/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
@@ -134,6 +134,7 @@ extern "C" {
 // Keep the booleans together to avoid misalignment during copy-by-value.
 bool low_vram; // if true, reduce VRAM usage at the cost of performance
+bool mul_mat_q; // if true, use experimental mul_mat_q kernels
 bool f16_kv; // use fp16 for KV cache
 bool logits_all; // the llama_eval() call computes all logits, not just the last one
 bool vocab_only; // only load the vocabulary, no weights
...