OpenDAS / ollama / Commits

Commit 7a1c3e62, authored Aug 01, 2023 by Michael Yang
Parent: da52f5bf

update llama.cpp
Showing 18 changed files with 2683 additions and 573 deletions (+2683, -573)
llama/ggml-alloc.c       +567   -0
llama/ggml-alloc.h        +48   -0
llama/ggml-cuda.cu      +1740   -418
llama/ggml-cuda.h          +2   -1
llama/ggml-metal.h         +1   -1
llama/ggml-metal.m        +18   -17
llama/ggml-metal.metal     +5   -2
llama/ggml-mpi.c           +1   -1
llama/ggml-mpi.h           +1   -1
llama/ggml-opencl.cpp      +1   -1
llama/ggml-opencl.h        +1   -1
llama/ggml.c              +50   -27
llama/ggml.h              +13   -2
llama/k_quants.c          +33   -31
llama/k_quants.h           +1   -1
llama/llama-util.h         +1   -1
llama/llama.cpp          +198   -67
llama/llama.h              +2   -1
llama/ggml-alloc.c  0 → 100644  (new file; diff collapsed, not shown)
llama/ggml-alloc.h  0 → 100644  (new file)
/**
* llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
*
* MIT License
*
* Copyright (c) 2023 Georgi Gerganov
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once

#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);

GGML_API void   ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);
GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);
GGML_API void   ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);

#ifdef __cplusplus
}
#endif
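The declarations above are the entire public surface of the new graph allocator. As a rough sketch of how the pieces fit together (the same pattern llama.cpp adopts later in this commit, but the malloc'd buffer and the build_graph helper here are illustrative, not code from the repository):

#include <stdlib.h>
#include "ggml.h"
#include "ggml-alloc.h"

/* Hypothetical helper: builds the compute graph inside a ggml context created
 * with no_alloc = true, so tensors carry no data pointers yet. */
struct ggml_cgraph * build_graph(struct ggml_context * ctx);

void run_with_allocator(struct ggml_context * ctx) {
    const size_t alignment = 32;                       /* illustrative alignment */

    /* measure pass: nothing is written, only the worst-case size is tracked */
    struct ggml_allocr * measure = ggml_allocr_new_measure(alignment);
    size_t size = ggml_allocr_alloc_graph(measure, build_graph(ctx));
    ggml_allocr_free(measure);

    /* real pass: hand the allocator a buffer of the measured size */
    void * buf = malloc(size + alignment);
    struct ggml_allocr * alloc = ggml_allocr_new(buf, size + alignment, alignment);
    ggml_allocr_alloc_graph(alloc, build_graph(ctx));
    /* ... run the graph ... */

    /* ggml_allocr_reset() would let the same buffer be reused for the next graph */
    ggml_allocr_free(alloc);
    free(buf);
}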
llama/ggml-cuda.cu  (diff collapsed, not shown)
llama/ggml-cuda.h

 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
@@ -53,6 +53,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
+void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 ...
llama/ggml-metal.h

 //go:build darwin
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
llama/ggml-metal.m

 //go:build darwin
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
@@ -746,7 +746,8 @@ void ggml_metal_graph_compute(
         // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
         GGML_ASSERT(ne00 == ne10);
-        GGML_ASSERT(ne02 == ne12);
+        // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+        GGML_ASSERT(ne03 == ne13);

         if (ggml_is_contiguous(src0) &&
             ggml_is_contiguous(src1) &&
 ...
@@ -774,11 +775,11 @@ void ggml_metal_graph_compute(
             initWithDevice:ctx->device transposeLeft:false transposeRight:true
             resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];

-        // we need to do ne02 multiplications
+        // we need to do ne12 multiplications
         // TODO: is there a way to do this in parallel - currently very slow ..
         // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-        for (int64_t i02 = 0; i02 < ne02; ++i02) {
-            size_t offs_src0_cur = offs_src0 + i02*nb02;
+        for (int64_t i02 = 0; i02 < ne12; ++i02) {
+            size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
             size_t offs_src1_cur = offs_src1 + i02*nb12;
             size_t offs_dst_cur  = offs_dst  + i02*nb2;
 ...
@@ -800,8 +801,6 @@ void ggml_metal_graph_compute(
         switch (src0t) {
             case GGML_TYPE_F16:
                 {
-                    GGML_ASSERT(ne02 == ne12);
                     nth0 = 64;
                     nth1 = 1;
                     [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
 ...
@@ -881,16 +880,18 @@ void ggml_metal_graph_compute(
         [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
         [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
         [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
-        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
-        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
-        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
-        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
-        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
-        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
-        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
-        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
-        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];
+        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
+        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];

         if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
             src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
 ...
llama/ggml-metal.metal

 //go:build darwin
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
@@ -537,11 +537,13 @@ kernel void kernel_mul_mat_f16_f32(
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
+        constant   int64_t & ne02,
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
+        constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
 ...
@@ -557,7 +559,7 @@ kernel void kernel_mul_mat_f16_f32(
     const int64_t r1 = tgpig.y;
     const int64_t im = tgpig.z;

-    device const half  * x = (device const half  *) (src0 + r0*nb01 + im*nb02);
+    device const half  * x = (device const half  *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);

     sum[tpitg.x] = 0.0f;
 ...
@@ -580,6 +582,7 @@ kernel void kernel_mul_mat_f16_f32(
     }
 }

 kernel void kernel_alibi_f32(
         device const float * src0,
         device       float * dst,
 ...
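The `im/(ne12/ne02)` change above is the broadcast rule that lets ne12 batches on the src1 side share only ne02 batches on the src0 side (the "gqa" comment in ggml-metal.m refers to the same layout). A small self-contained sketch of the index mapping, with illustrative batch counts that are not taken from this commit:

#include <stdio.h>

/* Sketch: map each src1 batch index (e.g. a query head) to the src0 batch index
 * (e.g. a shared KV head) that the updated Metal kernel reads from. */
int main(void) {
    const long ne12 = 32;                  /* illustrative: batches on the src1 side */
    const long ne02 = 8;                   /* illustrative: batches on the src0 side */
    for (long im = 0; im < ne12; ++im) {
        long i02 = im / (ne12 / ne02);     /* groups of ne12/ne02 share one src0 slice */
        printf("im=%2ld -> i02=%ld\n", im, i02);
    }
    return 0;                              /* im 0..3 map to 0, 4..7 map to 1, and so on */
}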
llama/ggml-mpi.c

 //go:build mpi
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
llama/ggml-mpi.h

 //go:build mpi
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
llama/ggml-opencl.cpp

 //go:build opencl
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
llama/ggml-opencl.h

 //go:build opencl
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
llama/ggml.c

 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
@@ -4585,8 +4585,10 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         struct ggml_context * ctx,
         enum   ggml_type      type,
         int                   n_dims,
-        const int64_t* ne,
-        void* data) {
+        const int64_t       * ne,
+        void                * data) {
+
+    assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);

     size_t data_size = 0;
 ...
@@ -6264,6 +6266,27 @@ struct ggml_tensor * ggml_reshape_4d(

 // ggml_view_1d

+static struct ggml_tensor * ggml_view_tensor_offset(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_dims,
+        const int64_t       * ne,
+        size_t                offset) {
+    // don't calculate an offset from an unallocated tensor
+    void * data = NULL;
+    if (a->data != NULL) {
+        data = (char *) a->data + offset;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
+
+    ggml_format_name(result, "%s (view)", a->name);
+
+    ggml_set_op_params(result, &offset, sizeof(offset));
+
+    return result;
+}
+
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
 ...
@@ -6276,10 +6299,7 @@ struct ggml_tensor * ggml_view_1d(
         is_node = true;
     }

-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
+    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);

     result->op   = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 ...
@@ -6306,10 +6326,7 @@ struct ggml_tensor * ggml_view_2d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };

-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
+    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);

     result->nb[1] = nb1;
     result->nb[2] = result->nb[1]*ne1;
 ...
@@ -6342,10 +6359,7 @@ struct ggml_tensor * ggml_view_3d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };

-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
+    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);

     result->nb[1] = nb1;
     result->nb[2] = nb2;
 ...
@@ -6380,10 +6394,7 @@ struct ggml_tensor * ggml_view_4d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };

-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
+    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);

     result->nb[1] = nb1;
     result->nb[2] = nb2;
 ...
@@ -6767,6 +6778,18 @@ struct ggml_tensor * ggml_rope_inplace(
     return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
 }

+struct ggml_tensor * ggml_rope_custom(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx,
+        float                 freq_base,
+        float                 freq_scale) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+}
+
 struct ggml_tensor * ggml_rope_custom_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
 ...
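The new ggml_view_tensor_offset helper matters mainly for the allocator path added elsewhere in this commit: graphs are now built in a no_alloc context, so the source tensor's data pointer is still NULL when a view is created and "(char *) a->data + offset" must not be computed yet; the offset is kept available through ggml_set_op_params instead. A trimmed, standalone sketch of just that guard (illustrative names, not repository code):

#include <stddef.h>

/* Sketch of the guard centralized by ggml_view_tensor_offset: only derive a data
 * pointer when the source tensor is already allocated; otherwise leave it NULL and
 * let the allocator resolve the address later from the stored offset. */
static void * view_data_or_null(void * src_data, size_t offset) {
    if (src_data == NULL) {
        return NULL;                       /* no_alloc build: defer to the allocator */
    }
    return (char *) src_data + offset;     /* normal build: same arithmetic as before */
}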
llama/ggml.h

 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
@@ -1196,7 +1196,18 @@ extern "C" {
             int                   mode,
             int                   n_ctx);

-    // custom RoPE, in-place, returns view(a)
+    // custom RoPE
+    GGML_API struct ggml_tensor * ggml_rope_custom(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale);
+
+    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
 ...
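ggml_rope_custom is the out-of-place counterpart of ggml_rope_custom_inplace: same arguments, but it produces a new tensor instead of returning a view of a. A minimal sketch of a call, with illustrative parameter values (the shapes and frequency settings below are examples, not values used by this commit):

#include "ggml.h"

/* Sketch: apply RoPE with a custom base frequency and scale to a query/key tensor. */
struct ggml_tensor * rope_example(struct ggml_context * ctx, struct ggml_tensor * cur) {
    const int   n_past     = 0;         /* illustrative */
    const int   n_dims     = 128;       /* illustrative: rotary dimensions */
    const int   mode       = 0;         /* illustrative RoPE mode */
    const int   n_ctx      = 2048;      /* illustrative */
    const float freq_base  = 10000.0f;  /* the default base used by ggml_rope_inplace */
    const float freq_scale = 0.5f;      /* illustrative: compress positions for longer contexts */

    return ggml_rope_custom(ctx, cur, n_past, n_dims, mode, n_ctx, freq_base, freq_scale);
}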
llama/k_quants.c

 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
@@ -65,6 +65,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))

+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
 //
 // 2-6 bit quantization in super-blocks
 //
 ...
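The remaining k_quants.c hunks below are mechanical: every use of `_mm256_set_m128i(hi, lo)` becomes the new `MM256_SET_M128I(hi, lo)` macro, which builds the same 256-bit value from two 128-bit halves using intrinsics that are more widely available than `_mm256_set_m128i` (missing from some older compilers). A small sketch showing the two forms produce the same lanes; the test values are illustrative and the program assumes an AVX2-capable compiler and CPU:

#include <immintrin.h>
#include <stdio.h>

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

/* Compile with AVX2 enabled (e.g. -mavx2). Prints the same four 64-bit lanes twice. */
int main(void) {
    const __m128i lo = _mm_set_epi64x(2, 1);    /* illustrative values */
    const __m128i hi = _mm_set_epi64x(4, 3);

    long long a[4], b[4];
    _mm256_storeu_si256((__m256i *) a, MM256_SET_M128I(hi, lo));
    _mm256_storeu_si256((__m256i *) b, _mm256_set_m128i(hi, lo)); /* original intrinsic, where available */

    printf("%lld %lld %lld %lld\n", a[0], a[1], a[2], a[3]);      /* 1 2 3 4 */
    printf("%lld %lld %lld %lld\n", b[0], b[1], b[2], b[3]);      /* 1 2 3 4 */
    return 0;
}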
@@ -1379,7 +1381,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
         const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
         const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};

         __m256i sumi = _mm256_setzero_si256();
 ...
@@ -1447,7 +1449,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));

         // sumf += -dmin * summs in 32bits*8
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);

         const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
         const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
 ...
@@ -1519,7 +1521,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         }

         // sumf += dall * isum - dmin * summs in 32bits
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
     }
 ...
@@ -1670,8 +1672,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         summs += dmin * smin;

         const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-        const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
-        const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
+        const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+        const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);

         const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
         const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
 ...
@@ -1735,10 +1737,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
         const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));

-        const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
-        const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
-        const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
-        const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+        const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+        const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+        const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+        const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));

         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
 ...
@@ -1943,7 +1945,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
         const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
         const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};

         // high bit
         const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
 ...
@@ -2154,7 +2156,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         }

         // multiply with block scale and accumulate
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
     }
 ...
@@ -2329,13 +2331,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         aux16[0] = a & 0x0f0f;
         aux16[1] = (a >> 4) & 0x0f0f;

-        const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
-        const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
+        const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+        const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));

         memcpy(&aux64, x[i].hmask, 8);

         const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-        __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
+        __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
         __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
         q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
         q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
 ...
@@ -2344,7 +2346,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);

         // prepare low and high bits
-        const __m256i q3aux  = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
+        const __m256i q3aux  = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
         const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
         const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
 ...
@@ -2455,7 +2457,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         p16_0 = _mm_add_epi32(p16_0, p16_2);
         p16_1 = _mm_add_epi32(p16_1, p16_3);
-        __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+        __m256i p16 = MM256_SET_M128I(p16_1, p16_0);

         // multiply with block scale and accumulate
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
 ...
@@ -2646,7 +2648,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);

         const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = _mm256_set_m128i(sc128, sc128);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);

         __m256i sumi = _mm256_setzero_si256();
 ...
@@ -2753,7 +2755,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         }

         __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
     }
 ...
@@ -2994,11 +2996,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
         const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);

         const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
         const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
     }
 ...
@@ -3186,7 +3188,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         summs += dmin * _mm_extract_epi32(hsum, 0);

         const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = _mm256_set_m128i(sc128, sc128);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);

         const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
         __m256i hmask = mone;
 ...
@@ -3325,7 +3327,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         }

         __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
     }
 ...
@@ -3488,13 +3490,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);

-        const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
-        const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
+        const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+        const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));

         int64_t aux64;
         memcpy(&aux64, x[i].qh, 8);
         const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
-        const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
+        const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);

         const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
         const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
 ...
@@ -3569,7 +3571,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
         const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));

-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
     }
 ...
@@ -3951,7 +3953,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         }

-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
     }
 ...
@@ -4109,8 +4111,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
         const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);

-        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
-        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
+        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);

         const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
         const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
 ...
@@ -4203,7 +4205,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
         sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));

-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
     }

     *s = hsum_float_8(acc);
 ...
llama/k_quants.h

 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
llama/llama-util.h

 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
llama/llama.cpp

 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
 ...
@@ -82,8 +82,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#include "ggml-alloc.h"
+#define LLAMA_USE_ALLOCATOR
+#else
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
+#endif

 // available llama models
 enum e_model {
 ...
@@ -353,13 +359,22 @@ struct llama_model {
 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-#ifdef GGML_USE_METAL
     ~llama_context() {
+        if (model_owner) {
+            delete &model;
+        }
+#ifdef GGML_USE_METAL
         if (ctx_metal) {
             ggml_metal_free(ctx_metal);
         }
-    }
 #endif
+#ifdef LLAMA_USE_ALLOCATOR
+        if (alloc) {
+            ggml_allocr_free(alloc);
+        }
+#endif
+    }

     std::mt19937 rng;

     bool has_evaluated_once = false;
 ...
@@ -397,7 +412,17 @@ struct llama_context {
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;

+#ifdef LLAMA_USE_ALLOCATOR
+    llama_ctx_buffer buf_alloc;
+    ggml_allocr * alloc = NULL;
+#endif
+
+#ifdef LLAMA_USE_SCRATCH
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    int    buf_last = 0;
+    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
+#endif

 #ifdef GGML_USE_METAL
     ggml_metal_context * ctx_metal = NULL;
 ...
@@ -407,9 +432,6 @@ struct llama_context {
     ggml_mpi_context * ctx_mpi = NULL;
 #endif

-    int    buf_last = 0;
-    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
-
     void use_buf(struct ggml_context * ctx, int i) {
 #if defined(LLAMA_USE_SCRATCH)
         size_t last_size = 0;
 ...
@@ -905,6 +927,7 @@ struct llama_context_params llama_context_default_params() {
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
+        /*.mul_mat_q                   =*/ false,
         /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
 ...
@@ -1032,6 +1055,7 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
 ...
@@ -1160,9 +1184,11 @@ static void llama_model_load_internal(
     }

     (void) main_gpu;
+    (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
+    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
 ...
@@ -1256,12 +1282,16 @@ static void llama_model_load_internal(
     const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;

     // this is the total memory required to run the inference
-    const size_t mem_required =
+    size_t mem_required =
         ctx_size +
-        mmapped_size - vram_weights + // weights in VRAM not in memory
+        mmapped_size - vram_weights; // weights in VRAM not in memory
+
+#ifndef LLAMA_USE_ALLOCATOR
+    mem_required +=
         MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
         MEM_REQ_SCRATCH1().at(model.type) +
         MEM_REQ_EVAL().at(model.type);
+#endif

     // this is the memory required by one llama_state
     const size_t mem_required_state =
 ...
@@ -1367,6 +1397,7 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
 ...
@@ -1377,7 +1408,8 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
 ...
@@ -1386,32 +1418,15 @@ static bool llama_model_load(
     }
 }

-// evaluate the transformer
-//
-//   - lctx:      llama context
-//   - tokens:    new batch of tokens to process
-//   - embd       embeddings input
-//   - n_tokens   number of tokens
-//   - n_past:    the context size so far
-//   - n_threads: number of threads to use
-//
-static bool llama_eval_internal(
+static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
      const llama_token * tokens,
            const float * embd,
                    int   n_tokens,
-                   int   n_past,
-                   int   n_threads,
-            const char * cgraph_fname) {
+                   int   n_past) {

     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));

-#ifdef GGML_USE_MPI
-    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
-    const int64_t t_start_us = ggml_time_us();
-
     const int N = n_tokens;

     const auto & model = lctx.model;
 ...
@@ -1427,10 +1442,8 @@ static bool llama_eval_internal(
     const int64_t n_head      = hparams.n_head;
     const int64_t n_head_kv   = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
-    const int64_t n_vocab     = hparams.n_vocab;
     const int64_t n_embd_gqa  = hparams.n_embd_gqa();

     LLAMA_ASSERT(n_embd_head == hparams.n_rot);

     const float freq_base = hparams.rope_freq_base;
 ...
@@ -1442,26 +1455,35 @@ static bool llama_eval_internal(
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;

     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.addr,
         /*.no_alloc   =*/ false,
     };

+#ifdef LLAMA_USE_ALLOCATOR
+    params.no_alloc = true;
+#endif
+
     struct ggml_context * ctx0 = ggml_init(params);

     ggml_cgraph * gf = ggml_new_graph(ctx0);

-    // for big prompts, if BLAS is enabled, it is better to use only one thread
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
-
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

     if (tokens) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);

+#ifdef LLAMA_USE_ALLOCATOR
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+#else
         memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+#endif
         ggml_set_name(inp_tokens, "inp_tokens");

         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
 ...
@@ -1471,8 +1493,16 @@ static bool llama_eval_internal(
 #endif

         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);

+#ifdef LLAMA_USE_ALLOCATOR
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+        }
+#else
         memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+#endif
     }

     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;
 ...
@@ -1498,6 +1528,17 @@ static bool llama_eval_internal(
     }
 #endif // GGML_USE_CUBLAS

+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+#else
+    ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+#endif
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_format_name(inpL, "layer_inp_%d", il);
 ...
@@ -1593,9 +1634,6 @@ static bool llama_eval_internal(
             ggml_set_name(KQ, "KQ");

             // KQ_scaled = KQ / sqrt(n_embd_head)
-            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
-            ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
-
             // KQ_scaled shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
             offload_func_kq(KQ_scaled);
 ...
@@ -1711,9 +1749,6 @@ static bool llama_eval_internal(
     lctx.use_buf(ctx0, 0);

-    // used at the end to optionally extract the embeddings
-    struct ggml_tensor * embeddings = NULL;
-
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 ...
@@ -1724,8 +1759,6 @@ static bool llama_eval_internal(
         cur = ggml_mul(ctx0, cur, model.norm);
         // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
         ggml_set_name(cur, "result_norm");
-
-        embeddings = cur;
     }

     // lm_head
 ...
@@ -1737,12 +1770,88 @@ static bool llama_eval_internal(
     // logits -> probs
     //cur = ggml_soft_max_inplace(ctx0, cur);

-    // run the computation
     ggml_build_forward_expand(gf, cur);

-    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
+    }
+
+#if 0
+    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+            ggml_used_mem(ctx0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(1)/1024.0/1024.0,
+            lctx.work_buffer.size()/1024.0/1024.0,
+            n_past, N);
+#endif
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+// evaluate the transformer
+//
+//   - lctx:      llama context
+//   - tokens:    new batch of tokens to process
+//   - embd       embeddings input
+//   - n_tokens   number of tokens
+//   - n_past:    the context size so far
+//   - n_threads: number of threads to use
+//
+static bool llama_eval_internal(
+         llama_context & lctx,
+     const llama_token * tokens,
+           const float * embd,
+                   int   n_tokens,
+                   int   n_past,
+                   int   n_threads,
+            const char * cgraph_fname) {
+
+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
+    const int64_t t_start_us = ggml_time_us();
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif
+
+    const int N = n_tokens;
+
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    LLAMA_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd  = hparams.n_embd;
+    const int64_t n_vocab = hparams.n_vocab;
+
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_reset(lctx.alloc);
+#endif
+
+    ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
+
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_alloc_graph(lctx.alloc, gf);
+#endif
+
+    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+
+    // for big prompts, if BLAS is enabled, it is better to use only one thread
+    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+
+    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    LLAMA_ASSERT(strcmp(res->name,        "result_output") == 0);
+    LLAMA_ASSERT(strcmp(embeddings->name, "result_norm")   == 0);
+
 #if GGML_USE_MPI
+    const int64_t n_layer = hparams.n_layer;
+
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 ...
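A side effect of splitting graph construction out of llama_eval_internal, visible in the hunk above, is that the two functions now communicate through the graph itself: the build step names its output tensors, and the eval step recovers them by position and asserts on the names. A small sketch of that convention (names mirror the diff; this is an illustration, not the real function):

#include <string.h>
#include <assert.h>
#include "ggml.h"

/* Sketch: how the eval side locates the build side's outputs without sharing
 * local tensor pointers across the two functions. */
static void check_graph_outputs(struct ggml_cgraph * gf) {
    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1]; /* logits, named "result_output" in the build step */
    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; /* final norm output, named "result_norm" */

    assert(strcmp(res->name,        "result_output") == 0);
    assert(strcmp(embeddings->name, "result_norm")   == 0);
}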
@@ -1754,7 +1863,10 @@ static bool llama_eval_internal(
...
@@ -1754,7 +1863,10 @@ static bool llama_eval_internal(
//}
//}
ggml_metal_set_n_cb
(
lctx
.
ctx_metal
,
n_threads
);
ggml_metal_set_n_cb
(
lctx
.
ctx_metal
,
n_threads
);
ggml_metal_graph_compute
(
lctx
.
ctx_metal
,
gf
);
ggml_metal_graph_compute
(
lctx
.
ctx_metal
,
gf
);
ggml_metal_get_tensor
(
lctx
.
ctx_metal
,
cur
);
ggml_metal_get_tensor
(
lctx
.
ctx_metal
,
res
);
if
(
!
lctx
.
embedding
.
empty
())
{
ggml_metal_get_tensor
(
lctx
.
ctx_metal
,
embeddings
);
}
}
else
{
}
else
{
// IMPORTANT:
// IMPORTANT:
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
...
@@ -1785,8 +1897,6 @@ static bool llama_eval_internal(
    // update kv token count
    lctx.kv_self.n = n_past + N;

-   struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];

    if (cgraph_fname) {
        ggml_graph_export(gf, cgraph_fname);
    }
...
@@ -1824,21 +1934,6 @@ static bool llama_eval_internal(
        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
    }

-   if (mem_per_token == 0) {
-       mem_per_token = ggml_used_mem(ctx0)/N;
-   }
-
-#if 0
-   printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
-           ggml_used_mem(ctx0)/1024.0/1024.0,
-           lctx.get_buf_max_mem(0)/1024.0/1024.0,
-           lctx.get_buf_max_mem(1)/1024.0/1024.0,
-           lctx.work_buffer.size()/1024.0/1024.0,
-           n_past, N);
-#endif
-
-   ggml_free(ctx0);

    // measure the performance only for the single-token evals
    if (N == 1) {
        lctx.t_eval_us += ggml_time_us() - t_start_us;
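For orientation, a minimal caller-side sketch of the single-token eval path that the t_eval_us timing above covers. It relies on the public llama_eval, llama_get_logits and llama_n_vocab declarations from llama.h at this revision; generate_greedy, its parameters and the greedy argmax sampling are illustrative stand-ins, not part of this commit.

#include <algorithm>
#include <vector>
#include "llama.h"

// Greedy generation loop: one llama_eval() call per generated token, which is
// exactly the N == 1 case timed by t_eval_us in llama_eval_internal().
static int generate_greedy(llama_context * ctx, const std::vector<llama_token> & prompt, int n_predict, int n_threads) {
    int n_past = 0;

    // evaluate the whole prompt as one batch
    if (llama_eval(ctx, prompt.data(), (int) prompt.size(), n_past, n_threads) != 0) {
        return 1;
    }
    n_past += (int) prompt.size();

    for (int i = 0; i < n_predict; ++i) {
        const float * logits  = llama_get_logits(ctx); // logits of the last evaluated token
        const int     n_vocab = llama_n_vocab(ctx);

        // pick the most likely token (greedy), then feed it back as a single-token batch
        llama_token next = (llama_token) (std::max_element(logits, logits + n_vocab) - logits);
        if (llama_eval(ctx, &next, 1, n_past, n_threads) != 0) {
            return 1;
        }
        n_past += 1;
    }
    return 0;
}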
...
@@ -1950,7 +2045,9 @@ struct llama_tokenizer {
            if (token == vocab_.token_to_id.end()) {
                // output any symbols that did not form tokens as bytes.
                for (int j = 0; j < (int) symbol.n; ++j) {
-                   llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                   // NOTE: old version, before #2420 - not sure what are the implications of this
+                   //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                   llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                    output.push_back(token_id);
                }
            } else {
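The change above replaces a hard-coded byte offset with a lookup in the vocabulary itself. A standalone sketch contrasting the two byte-fallback strategies; the helper names and the std::map stand in for vocab_.token_to_id and are illustrative only:

#include <cstdint>
#include <map>
#include <string>

// Two ways to map a raw byte that did not merge into any token:
//  - old: assume byte tokens sit at a fixed offset (id = byte value + 3)
//  - new: look the single-character string up in the vocabulary
static int byte_token_old(uint8_t byte) {
    return (int) byte + 3;
}

static int byte_token_new(const std::map<std::string, int> & token_to_id, char c) {
    // std::map::at throws std::out_of_range if the byte is missing from the vocabulary,
    // instead of silently producing an id that may not exist
    return token_to_id.at(std::string(1, c));
}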
...
@@ -3127,7 +3224,7 @@ struct llama_model * llama_load_model_from_file(
    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
-               params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale, params.low_vram,
+               params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale, params.low_vram,
                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                params.progress_callback_user_data)) {
        delete model;
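Since the new mul_mat_q flag is threaded through the model loader here (and declared in llama.h further down), a hedged sketch of how a caller might enable it; the model path is a placeholder, and llama_context_default_params, llama_new_context_with_model and llama_free_model are assumed from the public API at this revision:

#include "llama.h"

static int load_with_mul_mat_q(void) {
    llama_context_params params = llama_context_default_params();
    params.mul_mat_q = true; // opt in to the experimental mul_mat_q kernels
    params.n_ctx     = 2048;

    llama_model * model = llama_load_model_from_file("model.bin", params); // placeholder path
    if (model == NULL) {
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, params);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // ... evaluate / generate ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}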
...
@@ -3204,10 +3301,47 @@ struct llama_context * llama_new_context_with_model(
            ctx->embedding.resize(hparams.n_embd);
        }

+#ifdef LLAMA_USE_ALLOCATOR
+       {
+           static const size_t tensor_alignment = 32;
+           // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
+           ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+
+           // create measure allocator
+           ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
+
+           // build worst-case graph
+           int n_tokens = std::min((int) hparams.n_ctx, params.n_batch);
+           int n_past = hparams.n_ctx - n_tokens;
+           llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+           ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
+
+           // measure memory requirements for the graph
+           size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
+
+           fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+
+           // debug - for comparison with scratch buffer
+           //size_t prev_req =
+           //    MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
+           //    MEM_REQ_SCRATCH1().at(ctx->model.type) +
+           //    MEM_REQ_EVAL().at(ctx->model.type);
+           //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+
+           // recreate allocator with exact memory requirements
+           ggml_allocr_free(ctx->alloc);
+
+           ctx->buf_alloc.resize(alloc_size);
+           ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+       }
+#else
        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+#endif

+#ifdef LLAMA_USE_SCRATCH
        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
+#endif
    }
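The block above is the measure-then-allocate pattern from the new ggml-alloc API: run the worst-case graph through a measuring allocator, size a buffer from the result, then recreate the allocator over that buffer. A condensed sketch of the same sequence in isolation; the build_graph callback is a hypothetical stand-in for llama_build_graph, and only the ggml_allocr_* calls are taken from this commit:

#include <cstdlib>
#include "ggml.h"
#include "ggml-alloc.h"

// build_graph stands in for llama_build_graph(): it must construct the same
// (worst-case) graph for the measure pass as it later does for real evaluations.
static void allocator_pattern(ggml_cgraph * (*build_graph)(void)) {
    const size_t tensor_alignment = 32;

    // 1) measure: a dummy allocator walks the graph and records the worst-case buffer size
    ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
    ggml_cgraph * gf    = build_graph();
    const size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf) + tensor_alignment;
    ggml_allocr_free(alloc);

    // 2) allocate: recreate the allocator over a buffer of exactly the measured size
    void * buf = std::malloc(alloc_size);
    alloc = ggml_allocr_new(buf, alloc_size, tensor_alignment);

    // 3) per evaluation: reset the allocator, rebuild the graph, and let it place the tensor data
    ggml_allocr_reset(alloc);
    gf = build_graph();
    ggml_allocr_alloc_graph(alloc, gf);

    ggml_allocr_free(alloc);
    std::free(buf);
}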
#ifdef GGML_USE_METAL
...
@@ -3277,9 +3411,6 @@ struct llama_context * llama_init_from_file(
}

void llama_free(struct llama_context * ctx) {
    if (ctx->model_owner) {
        delete &ctx->model;
    }
    delete ctx;
}
...

llama/llama.h
View file @ 7a1c3e62

/**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
 *
 * MIT License
 *
...
@@ -134,6 +134,7 @@ extern "C" {
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
+       bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights
...