OpenDAS / ollama / Commits

Commit 20c5fd39 (unverified), authored May 08, 2025 by Devon Rifkin, committed by GitHub on May 08, 2025

    Merge branch 'main' into drifkin/array-head-count-simple

Parents: d2ee599d, 6e9a7a25
Showing 20 changed files with 667 additions and 212 deletions (+667 -212)
ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu               +58  -39
ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh               +9   -6
ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh                +2   -0
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal    +3   -3
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m            +366  -40
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal          +3   -3
ml/backend/ggml/ggml/src/ggml.c                              +56  -48
ml/backend/ggml/ggml/src/ggml.go                              +5  -10
ml/backend/ggml/quantization.go                              +83   -0
parser/parser.go                                             +33   -5
parser/parser_test.go                                         +1   -8
progress/bar.go                                               +1   -1
progress/progress.go                                         +16   -2
runner/llamarunner/runner.go                                  +0   -5
runner/ollamarunner/cache.go                                  +1   -1
runner/ollamarunner/runner.go                                 +7  -16
scripts/build_windows.ps1                                     +1  -15
scripts/env.sh                                                +0   -2
server/create.go                                             +17   -8
server/images.go                                              +5   -0
ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu
#include "quantize.cuh"
#include <cstdint>
static
__global__
void
quantize_q8_1
(
const
float
*
__restrict__
x
,
void
*
__restrict__
vy
,
const
int64_t
kx
,
const
int64_t
kx0_padded
)
{
const
int64_t
ix0
=
(
int64_t
)
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
static
__global__
void
quantize_q8_1
(
const
float
*
__restrict__
x
,
void
*
__restrict__
vy
,
const
int64_t
ne00
,
const
int64_t
s01
,
const
int64_t
s02
,
const
int64_t
s03
,
const
int64_t
ne0
,
const
int
ne1
,
const
int
ne2
)
{
const
int64_t
i0
=
(
int64_t
)
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
if
(
i
x
0
>=
kx0_padded
)
{
if
(
i0
>=
ne0
)
{
return
;
}
const
int64_t
ix1
=
blockIdx
.
y
;
const
int64_t
i1
=
blockIdx
.
y
;
const
int64_t
i2
=
blockIdx
.
z
%
ne2
;
const
int64_t
i3
=
blockIdx
.
z
/
ne2
;
const
int64_t
i_padded
=
ix1
*
kx0_padded
+
ix0
;
const
int64_t
&
i00
=
i0
;
const
int64_t
&
i01
=
i1
;
const
int64_t
&
i02
=
i2
;
const
int64_t
&
i03
=
i3
;
const
int64_t
i_cont
=
((
i3
*
ne2
+
i2
)
*
ne1
+
i1
)
*
ne0
+
i0
;
block_q8_1
*
y
=
(
block_q8_1
*
)
vy
;
const
int64_t
ib
=
i_
padded
/
QK8_1
;
// block index
const
int64_t
iqs
=
i_
padded
%
QK8_1
;
// quant index
const
int64_t
ib
=
i_
cont
/
QK8_1
;
// block index
const
int64_t
iqs
=
i_
cont
%
QK8_1
;
// quant index
const
float
xi
=
i
x
0
<
kx
?
x
[
i
x1
*
kx
+
i
x
0
]
:
0.0
f
;
const
float
xi
=
i0
<
ne00
?
x
[
i
03
*
s03
+
i02
*
s02
+
i01
*
s01
+
i
0
0
]
:
0.0
f
;
float
amax
=
fabsf
(
xi
);
float
sum
=
xi
;
amax
=
warp_reduce_max
(
amax
);
sum
=
warp_reduce_sum
(
sum
);
sum
=
warp_reduce_sum
(
sum
);
const
float
d
=
amax
/
127
;
const
float
d
=
amax
/
127
;
const
int8_t
q
=
amax
==
0.0
f
?
0
:
roundf
(
xi
/
d
);
y
[
ib
].
qs
[
iqs
]
=
q
;
...
...
@@ -39,29 +49,38 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 template <mmq_q8_1_ds_layout ds_layout>
 static __global__ void quantize_mmq_q8_1(
-    const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
+    const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
+    const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
+    const int64_t ne0, const int ne1, const int ne2) {

     constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
     constexpr int vals_per_sum   = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32;

-    const int64_t ix0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;
+    const int64_t i0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;

-    if (ix0 >= kx0_padded) {
+    if (i0 >= ne0) {
         return;
     }

-    const float4 * x4 = (const float4 *) x;
+    const int64_t i1 = blockIdx.y;
+    const int64_t i2 = blockIdx.z % ne2;
+    const int64_t i3 = blockIdx.z / ne2;

-    const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
+    const int64_t i00 = i0;
+    const int64_t i01 = ids ? ids[i1] : i1;
+    const int64_t i02 = i2;
+    const int64_t i03 = i3;
+
+    const float4 * x4 = (const float4 *) x;

     block_q8_1_mmq * y = (block_q8_1_mmq *) vy;

     const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel
-    const int64_t ib  = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y;                   // block index in channel
-    const int64_t iqs = ix0 % (4*QK8_1);                                            // quant index in block
+    const int64_t ib  = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.y;                    // block index in channel
+    const int64_t iqs = i0 % (4*QK8_1);                                             // quant index in block

     // Load 4 floats per thread and calculate max. abs. value between them:
-    const float4 xi = ix0 < kx0 ? x4[(ix1*kx0 + ix0)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    const float4 xi = i0 < ne00 ? x4[(i03*s03 + i02*s02 + i01*s01 + i00)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);

     float amax = fabsf(xi.x);
     amax = fmaxf(amax, fabsf(xi.y));
     amax = fmaxf(amax, fabsf(xi.z));
...
@@ -77,7 +96,7 @@ static __global__ void quantize_mmq_q8_1(
     if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) {
         sum = xi.x + xi.y + xi.z + xi.w;

-        // Exchange calculate sum across vals_per_sum/4 threads.
+        // Calculate sums across vals_per_sum/4 threads.
 #pragma unroll
         for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) {
             sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE);
...
@@ -127,40 +146,40 @@ static __global__ void quantize_mmq_q8_1(
 }

 void quantize_row_q8_1_cuda(
-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
-    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
-
-    GGML_ASSERT(kx0_padded % QK8_1 == 0);
-
-    const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    const dim3 num_blocks(block_num_x, kx1*channels, 1);
+    const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
+    const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
+
+    GGML_ASSERT(!ids);
+    GGML_ASSERT(ne0 % QK8_1 == 0);
+
+    const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
     const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
-    GGML_UNUSED(type_x);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
+    GGML_UNUSED(type_src0);
 }

 void quantize_mmq_q8_1_cuda(
-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
-    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
-
-    GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
+    const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
+    const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
+
+    GGML_ASSERT(ne0 % (4*QK8_1) == 0);

-    const int64_t block_num_x = (kx0_padded + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
-    const dim3 num_blocks(block_num_x, kx1, channels);
+    const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
+    const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
     const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1);
-    switch (mmq_get_q8_1_ds_layout(type_x)) {
+    switch (mmq_get_q8_1_ds_layout(type_src0)) {
         case MMQ_Q8_1_DS_LAYOUT_D4:
             quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D4>
-                <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
+                <<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
             break;
         case MMQ_Q8_1_DS_LAYOUT_DS4:
             quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_DS4>
-                <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
+                <<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
             break;
         case MMQ_Q8_1_DS_LAYOUT_D2S6:
             quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D2S6>
-                <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
+                <<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
             break;
         default:
             GGML_ABORT("fatal error");
...
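Note: the change above generalizes these kernels from a padded, row-contiguous input (kx/kx0_padded) to an arbitrary 4D tensor described by its shape (ne00, ne0..ne2) and per-dimension strides (s01..s03), with an optional ids indirection for expert selection. A minimal host-side Go sketch of the two index computations, with illustrative names not taken from the commit:

package main

import "fmt"

// srcOffset resolves a logical 4D coordinate through explicit strides, so
// non-contiguous source tensors can be read (names are illustrative).
func srcOffset(i0, i1, i2, i3, s01, s02, s03 int64) int64 {
	return i3*s03 + i2*s02 + i1*s01 + i0
}

// contIndex maps the same coordinate into a contiguous destination of shape
// ne0 x ne1 x ne2, which the kernel then splits into block and quant indices.
func contIndex(i0, i1, i2, i3, ne0, ne1, ne2 int64) int64 {
	return ((i3*ne2+i2)*ne1+i1)*ne0 + i0
}

func main() {
	const QK8_1 = 32
	iCont := contIndex(5, 2, 1, 0, 64, 4, 2)
	fmt.Println("block:", iCont/QK8_1, "quant:", iCont%QK8_1)
	fmt.Println("src offset:", srcOffset(5, 2, 1, 0, 64, 256, 512))
}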
ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh

...
@@ -12,13 +12,16 @@ static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk
 static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");

 typedef void (*quantize_cuda_t)(
-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
-    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream);
+    const float * x, const int32_t * ids, void * vy, ggml_type type_src0,
+    int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
+    int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);

 void quantize_row_q8_1_cuda(
-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
-    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream);
+    const float * x, const int32_t * ids, void * vy, ggml_type type_src0,
+    int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
+    int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);

 void quantize_mmq_q8_1_cuda(
-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
-    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream);
+    const float * x, const int32_t * ids, void * vy, ggml_type type_src0,
+    int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
+    int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh

 #pragma once

 #include "common.cuh"
+#include <cstdint>
...
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal

@@ -5690,7 +5690,7 @@ kernel void kernel_flash_attn_ext(
     {
         float S[Q] = { [0 ... Q-1] = 0.0f };
-        float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
+        float M[Q] = { [0 ... Q-1] = -__FLT_MAX__/2 };

         // thread indices inside the simdgroup
         // TODO: see if we can utilize quad-group functions for better performance
...
@@ -5950,7 +5950,7 @@ kernel void kernel_flash_attn_ext(
         // reduce the warps sequentially
         for (ushort sg = 1; sg < nsg; ++sg) {
             float S = { 0.0f };
-            float M = { -__FLT16_MAX__/2 };
+            float M = { -__FLT_MAX__/2 };

             threadgroup_barrier(mem_flags::mem_threadgroup);
...
@@ -6197,7 +6197,7 @@ kernel void kernel_flash_attn_ext_vec(
     {
         float S = 0.0f;
-        float M = -__FLT16_MAX__/2;
+        float M = -__FLT_MAX__/2;

         // thread indices inside the simdgroup
         const short tx = tiisg%NL;
...
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m

...
@@ -44,8 +44,8 @@ static struct ggml_backend_device g_ggml_backend_metal_device;
 // note: assumes single GPU device - the default one
 // TODO: support multiple GPU devices
 static struct ggml_backend_metal_device_context {
-    id<MTLDevice> mtl_device;
-    int mtl_device_ref_count;
+    id<MTLDevice>  mtl_device;
+    int            mtl_device_ref_count;
     id<MTLLibrary> mtl_library;

     bool has_simdgroup_reduction;
...
@@ -491,7 +491,259 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_COUNT
 };

+//
+// ggml_metal_heap
+//
+
+struct ggml_metal_heap {
+    // number of times the heap was unused
+    int n_unused;
+
+    // total number of buffer allocations in this heap across all computes
+    int64_t n_alloc;
+
+    // current offset in the heap - we reset this after each node in order to reuse the memory
+    size_t offs;
+
+    // the currently allocated MTLBuffer objects in this heap
+    id<MTLHeap> obj;
+
+    NSMutableArray * bufs;
+};
+
+static struct ggml_metal_heap * ggml_metal_heap_init(id<MTLDevice> device, size_t size) {
+    struct ggml_metal_heap * heap = calloc(1, sizeof(struct ggml_metal_heap));
+
+    MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init];
+    desc.storageMode  = MTLStorageModePrivate;
+    desc.cpuCacheMode = MTLCPUCacheModeDefaultCache;
+    desc.type         = MTLHeapTypePlacement;
+    desc.size         = size;
+
+    heap->n_unused = 0;
+    heap->n_alloc  = 0;
+
+    heap->obj = [device newHeapWithDescriptor:desc];
+    if (!heap->obj) {
+        GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size);
+
+        free(heap);
+
+        return false;
+    }
+
+    [desc release];
+
+    heap->bufs = [[NSMutableArray alloc] init];
+
+    return heap;
+}
+
+static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) {
+    heap->offs = 0;
+
+    // count how many graph computes the heap ended up being unused
+    if ([heap->bufs count] > 0) {
+        heap->n_unused = 0;
+    } else {
+        heap->n_unused++;
+    }
+
+    for (id<MTLBuffer> buf in heap->bufs) {
+        [buf release];
+    }
+    [heap->bufs removeAllObjects];
+
+    // tell the OS that it can reuse this memory if needed
+    // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc
+    [heap->obj setPurgeableState:MTLPurgeableStateVolatile];
+}
+
+static void ggml_metal_heap_free(struct ggml_metal_heap * heap) {
+    if (heap == nil) {
+        return;
+    }
+
+    ggml_metal_heap_reset(heap);
+
+    [heap->obj  release];
+    [heap->bufs release];
+
+    free(heap);
+}
+
+@interface ggml_metal_heap_ptr : NSObject
+
+@property (nonatomic, assign) struct ggml_metal_heap * data;
+
+@end
+
+@implementation ggml_metal_heap_ptr
+@end
+
+//
+// ggml_metal_mem_pool
+//
+
+struct ggml_metal_mem_pool {
+    id<MTLDevice> device;
+
+    int n_heaps; // total number of heaps ever created (including those that were removed)
+
+    NSMutableArray * heaps;
+    NSMutableArray * heaps_to_remove;
+};
+
+static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) {
+    struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool));
+
+    mem_pool->n_heaps = 0;
+
+    mem_pool->heaps           = [[NSMutableArray alloc] init];
+    mem_pool->heaps_to_remove = [[NSMutableArray alloc] init];
+
+    return mem_pool;
+}
+
+static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) {
+    GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps);
+
+    size_t size_all = 0;
+    size_t size_cur = 0;
+
+    for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
+        GGML_LOG_DEBUG("%s:   heap: %p\n", __func__, (void *) ptr.data);
+        GGML_LOG_DEBUG("%s:     n_alloc:  %" PRId64 "\n", __func__, ptr.data->n_alloc);
+        GGML_LOG_DEBUG("%s:     n_unused: %d\n", __func__, ptr.data->n_unused);
+        GGML_LOG_DEBUG("%s:     size:     %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0);
+        GGML_LOG_DEBUG("%s:     bufs:     %zu\n", __func__, [ptr.data->bufs count]);
+
+        if ([ptr.data->bufs count] > 0) {
+            size_cur += [ptr.data->obj size];
+        }
+        size_all += [ptr.data->obj size];
+
+        ggml_metal_heap_free(ptr.data);
+        [ptr release];
+    }
+    [mem_pool->heaps           release];
+    [mem_pool->heaps_to_remove release];
+
+    if (size_all > 0) {
+        GGML_LOG_DEBUG("%s: size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0);
+        GGML_LOG_DEBUG("%s: size_cur: %.2f MiB\n", __func__, size_cur / 1024.0 / 1024.0);
+    }
+
+    free(mem_pool);
+}
+
+static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) {
+    for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) {
+        ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i];
+
+        struct ggml_metal_heap * heap = ptr.data;
+        ggml_metal_heap_reset(heap);
+
+        // if the heap hasn't been used for a while, remove it
+        if (heap->n_unused >= 128) {
+            [mem_pool->heaps_to_remove addObject:@(i)];
+        }
+    }
+
+    if (mem_pool->heaps_to_remove.count > 0) {
+        for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) {
+            NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue];
+            ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index];
+
+            struct ggml_metal_heap * heap = ptr.data;
+            ggml_metal_heap_free(heap);
+
+            [mem_pool->heaps removeObjectAtIndex:index];
+            [ptr release];
+        }
+
+        [mem_pool->heaps_to_remove removeAllObjects];
+    }
+}
+
+static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) {
+    for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
+        ptr.data->offs = 0;
+    }
+}
+
+static id<MTLBuffer> ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) {
+    const size_t alignment = 32;
+
+    const size_t size_aligned = GGML_PAD(size, alignment);
+
+    // try one of the existing heaps
+    for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
+        struct ggml_metal_heap * heap = ptr.data;
+        if (heap->offs + size_aligned <= [heap->obj size]) {
+            // if this is the first buffer in the heap for the current command buffer, tell the OS that
+            //   it cannot free the memory used by the heap
+            // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc
+            if ([heap->bufs count] == 0) {
+                [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile];
+            }
+
+            id<MTLBuffer> buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs];
+            if (buf == nil) {
+                GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned);
+                return nil;
+            }
+
+            heap->n_alloc++;
+            heap->offs += size_aligned;
+
+            [heap->bufs addObject:buf];
+
+            return buf;
+        }
+    }
+
+    // create a new heap that can fit this buffer
+    ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new];
+
+    struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned);
+    if (heap == NULL) {
+        GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned);
+        return NULL;
+    }
+
+    //GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]);
+
+    heap_ptr.data = heap;
+    ggml_metal_heap_reset(heap);
+
+    [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile];
+    id<MTLBuffer> buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs];
+    if (buf == nil) {
+        GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned);
+        return NULL;
+    }
+
+    heap->n_alloc++;
+    heap->offs += size_aligned;
+
+    [heap->bufs addObject:buf];
+
+    [mem_pool->heaps addObject:heap_ptr];
+    mem_pool->n_heaps++;
+
+    return buf;
+}
+
+struct ggml_metal_command_buffer {
+    id<MTLCommandBuffer> obj;
+
+    // each command buffer has a memory pool from which it can allocate temporary buffers during the compute
+    struct ggml_metal_mem_pool * mem_pool;
+};
+
 struct ggml_backend_metal_context {
+    id<MTLDevice>       device;
     id<MTLCommandQueue> queue;

     dispatch_queue_t d_queue;
...
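Note: the heap code above is essentially a set of bump allocators: each allocation advances an offset, a reset rewinds the offset so the same memory is reused by the next compute, and heaps that stay idle for many resets are discarded. A minimal Go sketch of the same strategy (illustrative only; the real code manages MTLHeap/MTLBuffer objects and purgeable state):

package main

import "fmt"

type heap struct {
	size    int
	offs    int
	nUnused int
	inUse   bool
}

type pool struct{ heaps []*heap }

// align rounds n up to a multiple of a, mirroring GGML_PAD.
func align(n, a int) int { return (n + a - 1) / a * a }

func (p *pool) alloc(size int) *heap {
	size = align(size, 32)
	for _, h := range p.heaps { // try an existing heap first
		if h.offs+size <= h.size {
			h.offs += size
			h.inUse = true
			return h
		}
	}
	h := &heap{size: size, offs: size, inUse: true} // else grow the pool
	p.heaps = append(p.heaps, h)
	return h
}

func (p *pool) reset() {
	kept := p.heaps[:0]
	for _, h := range p.heaps {
		if h.inUse {
			h.nUnused = 0
		} else {
			h.nUnused++
		}
		h.offs, h.inUse = 0, false
		if h.nUnused < 128 { // drop heaps unused for many computes
			kept = append(kept, h)
		}
	}
	p.heaps = kept
}

func main() {
	p := &pool{}
	_ = p.alloc(100) // creates a heap of aligned size 128
	p.reset()        // memory becomes reusable for the next compute
	b := p.alloc(64) // served from the existing heap
	fmt.Println(len(p.heaps), b.offs)
}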
@@ -516,7 +768,7 @@ struct ggml_backend_metal_context {
     void (^encode_async)(size_t ith);

     // n_cb command buffers + 1 used by the main thread
-    id<MTLCommandBuffer> command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
+    struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1];

     // abort ggml_metal_graph_compute if callback returns true
     ggml_abort_callback abort_callback;
...
@@ -706,9 +958,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     struct ggml_backend_metal_device_context * ctx_dev = dev->context;

     id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);

     GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

-    ctx->queue = [device newCommandQueue];
+    ctx->device = device;
+    ctx->queue  = [device newCommandQueue];
     if (ctx->queue == nil) {
         GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
         return NULL;
...
@@ -769,7 +1023,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     ctx->gf = nil;
     ctx->encode_async = nil;
     for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
-        ctx->command_buffers[i] = nil;
+        ctx->cmd_bufs[i].obj = nil;
+
+        ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init();
+        ctx->cmd_bufs[i].mem_pool->device = device;
     }

 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
...
@@ -1183,6 +1440,12 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
     [ctx->queue release];

+    for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
+        // ctx->cmd_bufs[i].obj is auto released
+
+        ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool);
+    }
+
     dispatch_release(ctx->d_queue);

     free(ctx);
...
@@ -1489,10 +1752,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
     }
 }

-static void ggml_metal_encode_node(
+static bool ggml_metal_encode_node(
                         ggml_backend_t   backend,
                                    int   idx,
-          id<MTLComputeCommandEncoder>   encoder) {
+          id<MTLComputeCommandEncoder>   encoder,
+          struct ggml_metal_mem_pool   * mem_pool) {
     struct ggml_backend_metal_context        * ctx     = backend->context;
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
...
@@ -1508,7 +1772,7 @@ static void ggml_metal_encode_node(
     struct ggml_tensor * dst = node;

     if (ggml_is_empty(dst)) {
-        return;
+        return true;
     }

     switch (dst->op) {
...
@@ -1519,7 +1783,7 @@ static void ggml_metal_encode_node(
         case GGML_OP_PERMUTE:
             {
                 // noop -> next node
-            } return;
+            } return true;
         default:
             {
             } break;
...
@@ -1530,6 +1794,8 @@ static void ggml_metal_encode_node(
         GGML_ABORT("unsupported op");
     }

+    ggml_metal_mem_pool_clear(mem_pool);
+
     const int64_t ne00 = src0 ? src0->ne[0] : 0;
     const int64_t ne01 = src0 ? src0->ne[1] : 0;
     const int64_t ne02 = src0 ? src0->ne[2] : 0;
...
@@ -2176,26 +2442,76 @@ static void ggml_metal_encode_node(
                 const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
                 const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

+// use this branch to test the ggml_metal_mem_pool functionality
+#if 0
+                // cpy to tmp buffer in MTLHeap
+
+                id<MTLBuffer> h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0));
+                if (!h_src0) {
+                    GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0));
+                    return false;
+                }
+
+                offs_src0 = 0;
+
+                ggml_metal_kargs_cpy args_cpy = {
+                    /*.ne00 =*/ ne00,
+                    /*.ne01 =*/ ne01,
+                    /*.ne02 =*/ ne02,
+                    /*.ne03 =*/ ne03,
+                    /*.nb00 =*/ nb00,
+                    /*.nb01 =*/ nb01,
+                    /*.nb02 =*/ nb02,
+                    /*.nb03 =*/ nb03,
+                    /*.ne0  =*/ ne00,
+                    /*.ne1  =*/ ne01,
+                    /*.ne2  =*/ ne02,
+                    /*.ne3  =*/ ne03,
+                    /*.nb0  =*/ nb00,
+                    /*.nb1  =*/ nb01,
+                    /*.nb2  =*/ nb02,
+                    /*.nb3  =*/ nb03,
+                };
+
+                if (src0->type == GGML_TYPE_F16) {
+                    [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline];
+                } else {
+                    [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline];
+                }
+                [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:h_src0  offset:0         atIndex:2];
+
+                GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+                int nth_cpy = MIN(1024, ne00 / ggml_blck_size(src0->type));
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)];
+
+#else
+                id<MTLBuffer> h_src0 = id_src0;
+#endif
+                // softmax
+
                 ggml_metal_kargs_soft_max args = {
                     /*.ne00        =*/ ne00,
                     /*.ne01        =*/ ne01,
                     /*.ne02        =*/ ne02,
                     /*.scale       =*/ scale,
                     /*.max_bias    =*/ max_bias,
                     /*.m0          =*/ m0,
                     /*.m1          =*/ m1,
                     /*.n_head_log2 =*/ n_head_log2,
                 };

                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:h_src0 offset:offs_src0 atIndex:0];
                 if (id_src1) {
                     [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                 } else {
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                    [encoder setBuffer:h_src0 offset:offs_src0 atIndex:1];
                 }
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
                 [encoder setBytes:&args length:sizeof(args) atIndex:3];

                 [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
...
@@ -4634,6 +4950,8 @@ static void ggml_metal_encode_node(
                 GGML_ABORT("fatal error");
         }
     }
+
+    return true;
 }

 static enum ggml_status ggml_metal_graph_compute(
...
@@ -4687,25 +5005,25 @@ static enum ggml_status ggml_metal_graph_compute(
     }

     // the main thread commits the first few commands immediately
-    // command_buffer[n_cb]
+    // cmd_buf[n_cb]
     {
-        id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
-        ctx->command_buffers[n_cb] = command_buffer;
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+        ctx->cmd_bufs[n_cb].obj = cmd_buf;

-        [command_buffer enqueue];
+        [cmd_buf enqueue];

         ctx->encode_async(n_cb);
     }

     // prepare the rest of the command buffers asynchronously
-    // command_buffer[0.. n_cb)
+    // cmd_buf[0.. n_cb)
     for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
-        ctx->command_buffers[cb_idx] = command_buffer;
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+        ctx->cmd_bufs[cb_idx].obj = cmd_buf;

         // always enqueue the first two command buffers
         // enqueue all of the command buffers if we don't need to abort
         if (cb_idx < 2 || ctx->abort_callback == NULL) {
-            [command_buffer enqueue];
+            [cmd_buf enqueue];
         }
     }
...
@@ -4714,14 +5032,14 @@ static enum ggml_status ggml_metal_graph_compute(
     // wait for completion and check status of each command buffer
     // needed to detect if the device ran out-of-memory for example (#1881)
     {
-        id<MTLCommandBuffer> command_buffer = ctx->command_buffers[n_cb];
-        [command_buffer waitUntilCompleted];
+        id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[n_cb].obj;
+        [cmd_buf waitUntilCompleted];

-        MTLCommandBufferStatus status = [command_buffer status];
+        MTLCommandBufferStatus status = [cmd_buf status];
         if (status != MTLCommandBufferStatusCompleted) {
             GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
             if (status == MTLCommandBufferStatusError) {
-                GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
+                GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
             }

             return GGML_STATUS_FAILED;
...
@@ -4729,20 +5047,20 @@ static enum ggml_status ggml_metal_graph_compute(
         }
     }

     for (int i = 0; i < n_cb; ++i) {
-        id<MTLCommandBuffer> command_buffer = ctx->command_buffers[i];
-        [command_buffer waitUntilCompleted];
+        id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[i].obj;
+        [cmd_buf waitUntilCompleted];

-        MTLCommandBufferStatus status = [command_buffer status];
+        MTLCommandBufferStatus status = [cmd_buf status];
         if (status != MTLCommandBufferStatusCompleted) {
             GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
             if (status == MTLCommandBufferStatusError) {
-                GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
+                GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
             }

             return GGML_STATUS_FAILED;
         }

-        id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil);
+        id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil);
         if (!next_buffer) {
             continue;
         }
...
@@ -5126,8 +5444,9 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
         const int n_nodes_per_cb = ctx->n_nodes_per_cb;

-        id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
-        id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
+        id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[cb_idx].obj;
+
+        id<MTLComputeCommandEncoder> encoder = [cmd_buf computeCommandEncoder];

         int node_start = 0;
         int node_end   = n_nodes_0;
...
@@ -5139,22 +5458,29 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
         const bool should_capture = ctx->capture_next_compute;

+        struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool;
+        ggml_metal_mem_pool_reset(mem_pool);
+
         for (int idx = node_start; idx < node_end; ++idx) {
             if (should_capture) {
                 [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
             }

-            ggml_metal_encode_node(backend, idx, encoder);
+            const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool);

             if (should_capture) {
                 [encoder popDebugGroup];
             }
+
+            if (!res) {
+                break;
+            }
         }

         [encoder endEncoding];

         if (cb_idx < 2 || ctx->abort_callback == NULL) {
-            [command_buffer commit];
+            [cmd_buf commit];
         }
     });
 }
...
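Note: with ggml_metal_encode_node returning bool, the encode loop can stop early when a node fails (for example, a mem-pool allocation), while still ending the encoder and committing the command buffer so the queue stays consistent. A small Go sketch of that control flow (illustrative; encodeNode is a stand-in):

package main

import "fmt"

// encodeNode stands in for ggml_metal_encode_node, which now returns false
// when it cannot encode a node (e.g. a memory-pool allocation failed).
func encodeNode(idx int) bool { return idx != 3 } // pretend node 3 fails

func main() {
	for idx := 0; idx < 8; idx++ {
		if !encodeNode(idx) {
			fmt.Println("encode failed at node", idx, "- stopping early")
			break
		}
	}
	// the encoder is still ended and the command buffer still committed,
	// mirroring [encoder endEncoding] / [cmd_buf commit] above
	fmt.Println("endEncoding + commit")
}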
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal

@@ -3237,7 +3237,7 @@ kernel void kernel_flash_attn_ext(
     {
         float S[Q] = { [0 ... Q-1] = 0.0f };
-        float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
+        float M[Q] = { [0 ... Q-1] = -__FLT_MAX__/2 };

         // thread indices inside the simdgroup
         // TODO: see if we can utilize quad-group functions for better performance
...
@@ -3497,7 +3497,7 @@ kernel void kernel_flash_attn_ext(
         // reduce the warps sequentially
         for (ushort sg = 1; sg < nsg; ++sg) {
             float S = { 0.0f };
-            float M = { -__FLT16_MAX__/2 };
+            float M = { -__FLT_MAX__/2 };

             threadgroup_barrier(mem_flags::mem_threadgroup);
...
@@ -3744,7 +3744,7 @@ kernel void kernel_flash_attn_ext_vec(
     {
         float S = 0.0f;
-        float M = -__FLT16_MAX__/2;
+        float M = -__FLT_MAX__/2;

         // thread indices inside the simdgroup
         const short tx = tiisg%NL;
...
ml/backend/ggml/ggml/src/ggml.c

...
@@ -4,6 +4,7 @@
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-threading.h"
#include "ggml-cpu.h"
#include "ggml.h"
// FIXME: required here for quantization functions
...
...
@@ -382,58 +383,16 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
     }
 }

-// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
-//        currently, the ggml_cpu_has_* functions are entirely compile-time
 void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-    int64_t i = 0;
-#if defined(__F16C__)
-    //if (ggml_cpu_has_f16c()) {
-        for (; i + 7 < n; i += 8) {
-            __m256 x_vec = _mm256_loadu_ps(x + i);
-            __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storeu_si128((__m128i *)(y + i), y_vec);
-        }
-        for (; i + 3 < n; i += 4) {
-            __m128 x_vec = _mm_loadu_ps(x + i);
-            __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storel_epi64((__m128i *)(y + i), y_vec);
-        }
-    //}
-#endif
-    for (; i < n; i++) {
+    int i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_FP32_TO_FP16(x[i]);
     }
 }

 void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
-    int64_t i = 0;
-#if defined(__AVX512F__)
-    //if (ggml_cpu_has_avx512()) {
-        for (; i + 16 <= n; i += 16) {
-            _mm512_storeu_ps(y + i,
-                             _mm512_castsi512_ps(
-                                 _mm512_slli_epi32(
-                                     _mm512_cvtepu16_epi32(
-                                         _mm256_loadu_si256(
-                                             (const __m256i *)(x + i))),
-                                     16)));
-        }
-    //}
-#endif
-#if defined(__AVX2__)
-    //if (ggml_cpu_has_avx2()) {
-        for (; i + 8 <= n; i += 8) {
-            _mm256_storeu_ps(y + i,
-                             _mm256_castsi256_ps(
-                                 _mm256_slli_epi32(
-                                     _mm256_cvtepu16_epi32(
-                                         _mm_loadu_si128(
-                                             (const __m128i *)(x + i))),
-                                     16)));
-        }
-    //}
-#endif
-    for (; i < n; i++) {
+    int i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
     }
 }
...
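Note: the SIMD fast paths move out of core ggml, leaving only the scalar fallback here. For reference, bf16-to-f32 is just the upper 16 bits of an IEEE-754 float32, so the conversion is a 16-bit left shift reinterpreted as a float, as this Go sketch shows (illustrative only):

package main

import (
	"fmt"
	"math"
)

// bf16ToF32 widens a bfloat16 bit pattern to float32: shift the 16 payload
// bits into the high half and reinterpret.
func bf16ToF32(x uint16) float32 {
	return math.Float32frombits(uint32(x) << 16)
}

func main() {
	fmt.Println(bf16ToF32(0x3F80)) // 1.0
	fmt.Println(bf16ToF32(0xC000)) // -2.0
}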
@@ -956,6 +915,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CONV_TRANSPOSE_1D"
,
"IM2COL"
,
"IM2COL_BACK"
,
"CONV_2D_DW"
,
"CONV_TRANSPOSE_2D"
,
"POOL_1D"
,
"POOL_2D"
,
...
...
@@ -994,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW"
,
};
static_assert
(
GGML_OP_COUNT
==
8
2
,
"GGML_OP_COUNT != 8
2
"
);
static_assert
(
GGML_OP_COUNT
==
8
3
,
"GGML_OP_COUNT != 8
3
"
);
static
const
char
*
GGML_OP_SYMBOL
[
GGML_OP_COUNT
]
=
{
"none"
,
...
...
@@ -1051,6 +1011,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"conv_transpose_1d(x)"
,
"im2col(x)"
,
"im2col_back(x)"
,
"conv_2d_dw(x)"
,
"conv_transpose_2d(x)"
,
"pool_1d(x)"
,
"pool_2d(x)"
,
...
...
@@ -1089,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)"
,
};
static_assert
(
GGML_OP_COUNT
==
8
2
,
"GGML_OP_COUNT != 8
2
"
);
static_assert
(
GGML_OP_COUNT
==
8
3
,
"GGML_OP_COUNT != 8
3
"
);
static_assert
(
GGML_OP_POOL_COUNT
==
2
,
"GGML_OP_POOL_COUNT != 2"
);
...
...
@@ -1346,6 +1307,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     return
         tensor->nb[0] > tensor->nb[1] ||
         tensor->nb[1] > tensor->nb[2] ||
         tensor->nb[2] > tensor->nb[3];
 }

+bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
+    return
+        tensor->nb[0] > tensor->nb[2] &&
+        tensor->nb[1] > tensor->nb[0] &&
+        tensor->nb[2] == ggml_type_size(tensor->type);
+}
+
 static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
...
@@ -4052,6 +4020,46 @@ struct ggml_tensor * ggml_conv_2d_dw(
     return result;
 }

+// ggml_conv_2d_dw_direct
+
+struct ggml_tensor * ggml_conv_2d_dw_direct(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   stride0,
+        int                   stride1,
+        int                   pad0,
+        int                   pad1,
+        int                   dilation0,
+        int                   dilation1) {
+    GGML_ASSERT(a->ne[2] == 1);
+    GGML_ASSERT(a->ne[3] == b->ne[2]);
+    int64_t ne[4];
+    ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0);
+    ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1);
+    ne[2] = b->ne[2];
+    ne[3] = b->ne[3];
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
+
+    if (ggml_is_contiguous_channels(b)) {
+        // Result will be permuted the same way as input (CWHN order)
+        const int64_t type_size = ggml_type_size(result->type);
+        GGML_ASSERT(ggml_blck_size(result->type) == 1);
+        result->nb[0] = result->ne[2] * type_size;
+        result->nb[1] = result->ne[0] * result->nb[0];
+        result->nb[2] = type_size;
+    }
+
+    int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_CONV_2D_DW;
+    result->src[0] = a;
+    result->src[1] = b;
+    return result;
+}
+
 // ggml_conv_transpose_2d_p0

 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
...
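Note: ggml_conv_2d_dw_direct sizes its output with ggml_calc_conv_output_size, which follows the standard convolution formula out = (in + 2*pad - dilation*(kernel-1) - 1)/stride + 1. A Go sketch of that arithmetic (helper name is illustrative):

package main

import "fmt"

// convOutputSize mirrors the standard convolution output-size formula.
func convOutputSize(in, kernel, stride, pad, dilation int64) int64 {
	return (in+2*pad-dilation*(kernel-1)-1)/stride + 1
}

func main() {
	// a 224x224 input with a 3x3 depthwise kernel, stride 1, pad 1 keeps its size
	fmt.Println(convOutputSize(224, 3, 1, 1, 1)) // 224
}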
ml/backend/ggml/ggml/src/ggml.go

...
@@ -3,6 +3,7 @@ package ggml
 // #cgo CXXFLAGS: -std=c++17
 // #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_CPU
 // #cgo CPPFLAGS: -I${SRCDIR}/../include -I${SRCDIR}/ggml-cpu
+// #cgo windows CFLAGS: -Wno-dll-attribute-on-redeclaration
 // #cgo windows LDFLAGS: -lmsvcrt -static -static-libgcc -static-libstdc++
 // #include <stdlib.h>
 // #include "ggml-backend.h"
...
@@ -57,26 +58,20 @@ var OnceLoad = sync.OnceFunc(func() {
         exe = "."
     }

-    // PATH, LD_LIBRARY_PATH, and DYLD_LIBRARY_PATH are often
-    // set by the parent process, however, use a default value
-    // if the environment variable is not set.
-    var name, value string
+    var value string
     switch runtime.GOOS {
     case "darwin":
-        // On macOS, DYLD_LIBRARY_PATH is often not set, so
-        // we use the directory of the executable as the default.
-        name = "DYLD_LIBRARY_PATH"
         value = filepath.Dir(exe)
     case "windows":
-        name = "PATH"
         value = filepath.Join(filepath.Dir(exe), "lib", "ollama")
     default:
-        name = "LD_LIBRARY_PATH"
         value = filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
     }

-    paths, ok := os.LookupEnv(name)
+    // Avoid potentially loading incompatible GGML libraries
+    paths, ok := os.LookupEnv("OLLAMA_LIBRARY_PATH")
     if !ok {
+        slog.Debug("OLLAMA_LIBRARY_PATH not set, falling back to default", "search", value)
         paths = value
     }
...
ml/backend/ggml/quantization.go (new file, 0 → 100644)

package ggml

// #cgo CPPFLAGS: -I${SRCDIR}/ggml/src
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
// #include "ggml-quants.h"
import "C"

import (
	"unsafe"

	fsggml "github.com/ollama/ollama/fs/ggml"
)

// convertToF32 converts (dequantizes) the raw data to F32 so we can then quantize it
func ConvertToF32(data []byte, dtype uint32, nelements uint64) []float32 {
	f32s := make([]float32, nelements)
	elems := C.int64_t(nelements)
	switch dtype {
	case C.GGML_TYPE_F16:
		C.ggml_fp16_to_fp32_row((*C.uint16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
	case C.GGML_TYPE_Q4_0:
		C.dequantize_row_q4_0((*C.block_q4_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
	case C.GGML_TYPE_Q4_1:
		C.dequantize_row_q4_1((*C.block_q4_1)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
	case C.GGML_TYPE_Q5_0:
		C.dequantize_row_q5_0((*C.block_q5_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
	case C.GGML_TYPE_Q5_1:
		C.dequantize_row_q5_1((*C.block_q5_1)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
	case C.GGML_TYPE_Q8_0:
		C.dequantize_row_q8_0((*C.block_q8_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
	case C.GGML_TYPE_Q2_K:
		C.dequantize_row_q2_K((*C.block_q2_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
	case C.GGML_TYPE_Q3_K:
		C.dequantize_row_q3_K((*C.block_q3_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
	case C.GGML_TYPE_Q4_K:
		C.dequantize_row_q4_K((*C.block_q4_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
	case C.GGML_TYPE_Q5_K:
		C.dequantize_row_q5_K((*C.block_q5_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
	case C.GGML_TYPE_Q6_K:
		C.dequantize_row_q6_K((*C.block_q6_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
	case C.GGML_TYPE_BF16:
		C.ggml_bf16_to_fp32_row((*C.ggml_bf16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
	default:
		panic("unsupported quantization format")
	}
	return f32s
}

func Quantize(newType fsggml.TensorType, f32s []float32, shape []uint64) []byte {
	buf := make([]byte, len(f32s)*4) // upper bound on size
	nPerRow := C.int64_t(shape[0])
	nrows := C.int64_t(1)
	if len(shape) > 1 {
		nrows = C.int64_t(shape[1])
	}
	shape2 := C.int64_t(1)
	if len(shape) > 2 {
		shape2 = C.int64_t(shape[2])
	}
	nelements_matrix := nPerRow * nrows
	newSize := C.size_t(0)
	for i03 := C.int64_t(0); i03 < shape2; i03++ {
		f32s_03 := i03 * nelements_matrix
		buf_03 := C.int64_t(C.ggml_row_size(uint32(newType), nPerRow)) * i03 * nrows
		newSize += C.ggml_quantize_chunk(uint32(newType), (*C.float)(&f32s[f32s_03]), unsafe.Pointer(uintptr(unsafe.Pointer(&buf[0]))+uintptr(buf_03)), 0, nrows, nPerRow, nil)
	}
	return buf[:newSize]
}

func QuantizationVersion() uint32 {
	return uint32(C.GGML_QNT_VERSION)
}
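Note: the new file exposes dequantize/requantize helpers over cgo. A hypothetical usage sketch of a round trip (the shape handling and the target type here are placeholders, not from the commit):

package quantdemo

import (
	fsggml "github.com/ollama/ollama/fs/ggml"
	ggml "github.com/ollama/ollama/ml/backend/ggml"
)

// requantize dequantizes a tensor blob to float32, then requantizes it to a
// target type (TensorTypeQ4_K is an assumed example value).
func requantize(raw []byte, dtype uint32, shape []uint64) []byte {
	n := uint64(1)
	for _, d := range shape {
		n *= d
	}
	f32s := ggml.ConvertToF32(raw, dtype, n) // dequantize first
	return ggml.Quantize(fsggml.TensorTypeQ4_K, f32s, shape)
}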
parser/parser.go

...
@@ -39,7 +39,17 @@ func (f Modelfile) String() string {
     return sb.String()
 }

-var deprecatedParameters = []string{"penalize_newline"}
+var deprecatedParameters = []string{
+    "penalize_newline",
+    "low_vram",
+    "f16_kv",
+    "logits_all",
+    "vocab_only",
+    "use_mlock",
+    "mirostat",
+    "mirostat_tau",
+    "mirostat_eta",
+}

 // CreateRequest creates a new *api.CreateRequest from an existing Modelfile
 func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error) {
...
@@ -139,10 +149,28 @@ func fileDigestMap(path string) (map[string]string, error) {
     var files []string
     if fi.IsDir() {
-        files, err = filesForModel(path)
+        fs, err := filesForModel(path)
         if err != nil {
             return nil, err
         }
+
+        for _, f := range fs {
+            f, err := filepath.EvalSymlinks(f)
+            if err != nil {
+                return nil, err
+            }
+
+            rel, err := filepath.Rel(path, f)
+            if err != nil {
+                return nil, err
+            }
+
+            if !filepath.IsLocal(rel) {
+                return nil, fmt.Errorf("insecure path: %s", rel)
+            }
+
+            files = append(files, f)
+        }
     } else {
         files = []string{path}
     }
...
@@ -215,11 +243,11 @@ func filesForModel(path string) ([]string, error) {
         return nil, err
     }

-    for _, safetensor := range matches {
-        if ct, err := detectContentType(safetensor); err != nil {
+    for _, match := range matches {
+        if ct, err := detectContentType(match); err != nil {
             return nil, err
         } else if ct != contentType {
-            return nil, fmt.Errorf("invalid content type: expected %s for %s", ct, safetensor)
+            return nil, fmt.Errorf("invalid content type: expected %s for %s", ct, match)
         }
     }
...
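Note: the parser change above rejects model files whose resolved symlink escapes the model directory. A standalone Go sketch of the same check (function name is illustrative):

package main

import (
	"fmt"
	"path/filepath"
)

// checkInside mirrors the new per-file check: resolve symlinks, then require
// the resolved path to stay inside the model directory.
func checkInside(dir, file string) error {
	resolved, err := filepath.EvalSymlinks(file)
	if err != nil {
		return err
	}
	rel, err := filepath.Rel(dir, resolved)
	if err != nil {
		return err
	}
	if !filepath.IsLocal(rel) {
		return fmt.Errorf("insecure path: %s", rel)
	}
	return nil
}

func main() {
	// filepath.IsLocal is the core of the test: it rejects absolute paths
	// and anything that climbs out of the directory via "..".
	fmt.Println(filepath.IsLocal("weights.safetensors")) // true
	fmt.Println(filepath.IsLocal("../../etc/passwd"))    // false
}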
parser/parser_test.go

...
@@ -478,11 +478,7 @@ func TestParseFileParameters(t *testing.T) {
"num_gqa 1"
:
{
"num_gqa"
,
"1"
},
"num_gpu 1"
:
{
"num_gpu"
,
"1"
},
"main_gpu 1"
:
{
"main_gpu"
,
"1"
},
"low_vram true"
:
{
"low_vram"
,
"true"
},
"logits_all true"
:
{
"logits_all"
,
"true"
},
"vocab_only true"
:
{
"vocab_only"
,
"true"
},
"use_mmap true"
:
{
"use_mmap"
,
"true"
},
"use_mlock true"
:
{
"use_mlock"
,
"true"
},
"num_thread 1"
:
{
"num_thread"
,
"1"
},
"num_keep 1"
:
{
"num_keep"
,
"1"
},
"seed 1"
:
{
"seed"
,
"1"
},
...
...
@@ -496,9 +492,6 @@ func TestParseFileParameters(t *testing.T) {
"repeat_penalty 1.0"
:
{
"repeat_penalty"
,
"1.0"
},
"presence_penalty 1.0"
:
{
"presence_penalty"
,
"1.0"
},
"frequency_penalty 1.0"
:
{
"frequency_penalty"
,
"1.0"
},
"mirostat 1"
:
{
"mirostat"
,
"1"
},
"mirostat_tau 1.0"
:
{
"mirostat_tau"
,
"1.0"
},
"mirostat_eta 1.0"
:
{
"mirostat_eta"
,
"1.0"
},
"penalize_newline true"
:
{
"penalize_newline"
,
"true"
},
"stop ### User:"
:
{
"stop"
,
"### User:"
},
"stop ### User: "
:
{
"stop"
,
"### User:"
},
...
...
@@ -769,7 +762,7 @@ func getSHA256Digest(t *testing.T, r io.Reader) (string, int64) {
     return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
 }

-func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
+func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string, string) {
     t.Helper()

     f, err := os.CreateTemp(t.TempDir(), "testbin.*.gguf")
...
progress/bar.go

...
@@ -64,7 +64,7 @@ func formatDuration(d time.Duration) string {
 func (b *Bar) String() string {
     termWidth, _, err := term.GetSize(int(os.Stderr.Fd()))
     if err != nil {
-        termWidth = 80
+        termWidth = defaultTermWidth
     }

     var pre strings.Builder
...
progress/progress.go

...
@@ -4,8 +4,16 @@ import (
"bufio"
"fmt"
"io"
"os"
"sync"
"time"
"golang.org/x/term"
)
const
(
defaultTermWidth
=
80
defaultTermHeight
=
24
)
type
State
interface
{
...
...
@@ -83,6 +91,11 @@ func (p *Progress) Add(key string, state State) {
 }

 func (p *Progress) render() {
+    _, termHeight, err := term.GetSize(int(os.Stderr.Fd()))
+    if err != nil {
+        termHeight = defaultTermHeight
+    }
+
     p.mu.Lock()
     defer p.mu.Unlock()
...
@@ -102,8 +115,9 @@ func (p *Progress) render() {
         fmt.Fprint(p.w, "\033[1G")

         // render progress lines
-        for i, state := range p.states {
-            fmt.Fprint(p.w, state.String(), "\033[K")
+        maxHeight := min(len(p.states), termHeight)
+        for i := len(p.states) - maxHeight; i < len(p.states); i++ {
+            fmt.Fprint(p.w, p.states[i].String(), "\033[K")
             if i < len(p.states)-1 {
                 fmt.Fprint(p.w, "\n")
             }
...
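Note: render now clamps output to the terminal height, printing only the last termHeight progress states so the cursor cannot scroll off a short terminal. A small Go sketch of the windowing arithmetic (illustrative):

package main

import "fmt"

// visible returns the trailing window of states that fits on screen,
// matching the len(states)-maxHeight .. len(states) loop bounds above.
func visible(states []string, termHeight int) []string {
	maxHeight := min(len(states), termHeight)
	return states[len(states)-maxHeight:]
}

func main() {
	states := []string{"a", "b", "c", "d", "e"}
	fmt.Println(visible(states, 3))  // [c d e]
	fmt.Println(visible(states, 24)) // all five fit
}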
runner/llamarunner/runner.go

...
@@ -583,9 +583,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
         PenaltyRepeat:  req.Options.RepeatPenalty,
         PenaltyFreq:    req.Options.FrequencyPenalty,
         PenaltyPresent: req.Options.PresencePenalty,
-        Mirostat:       req.Options.Mirostat,
-        MirostatTau:    req.Options.MirostatTau,
-        MirostatEta:    req.Options.MirostatEta,
         Seed:           uint32(req.Options.Seed),
         Grammar:        req.Grammar,
     }
...
@@ -820,7 +817,6 @@ func Execute(args []string) error {
     threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
     verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
     noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-    mlock := fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
     tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
     multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
...
@@ -876,7 +872,6 @@ func Execute(args []string) error {
         NumGpuLayers: *nGpuLayers,
         MainGpu:      *mainGpu,
         UseMmap:      !*noMmap && lpaths.String() == "",
-        UseMlock:     *mlock,
         TensorSplit:  tensorSplitFloats,
         Progress: func(progress float32) {
             server.progress = progress
...
runner/ollamarunner/cache.go

...
@@ -284,7 +284,7 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error {
     copy(newInputs[numKeep:], slot.Inputs[numKeep+discard:])

     // Reset the cache
-    _ = c.cache.Remove(slot.Id, 0, -1)
+    _ = c.cache.Remove(slot.Id, 0, math.MaxInt32)
     slot.Inputs = []input.Input{}

     // Return error with inputs that need to be reprocessed
...
runner/ollamarunner/runner.go

...
@@ -34,14 +34,10 @@ import (
     _ "github.com/ollama/ollama/model/models"
 )

-type contextList struct {
-    list []ml.Context
-}
-
 type Sequence struct {
     // ctxs are used for allocating tensors that last the lifetime of the sequence, such as
     // multimodal embeddings
-    ctxs *contextList
+    ctxs []ml.Context

     // batch index
     iBatch int
...
@@ -177,8 +173,10 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
 // inputs processes the prompt and images into a list of inputs
 // by splitting the prompt on [img-<n>] tags, tokenizing text and
 // decoding images
-func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, *contextList, error) {
+func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, error) {
     var inputs []input.Input
+    var ctxs []ml.Context
+
     var parts []string
     var matches [][]string
...
@@ -192,13 +190,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, *
         parts = []string{prompt}
     }

-    var contexts contextList
-    runtime.AddCleanup(&contexts, func(ctxs []ml.Context) {
-        for _, ctx := range ctxs {
-            ctx.Close()
-        }
-    }, contexts.list)
-
     postTokenize := false
     for i, part := range parts {
         // text - tokenize
...
@@ -228,7 +219,8 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, *
             }

             ctx := s.model.Backend().NewContext()
-            contexts.list = append(contexts.list, ctx)
+            runtime.SetFinalizer(ctx, func(c ml.Context) { c.Close() })
+            ctxs = append(ctxs, ctx)

             imageEmbeddings, err := multimodalProcessor.EncodeMultimodal(ctx, images[imageIndex].Data)
             if err != nil {
                 return nil, nil, err
...
@@ -251,7 +243,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, *
         }
     }

-    return inputs, &contexts, nil
+    return inputs, ctxs, nil
 }

 type Server struct {
...
@@ -826,7 +818,6 @@ func Execute(args []string) error {
     threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
     verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
     _ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-    _ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
     tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
     multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
...
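Note: the runner drops the contextList wrapper (and its single runtime.AddCleanup) in favor of a plain slice plus a per-context runtime.SetFinalizer, so each context is closed when it individually becomes unreachable. A minimal Go sketch of that lifetime pattern (resource stands in for ml.Context):

package main

import (
	"fmt"
	"runtime"
)

type resource struct{ id int }

func (r *resource) Close() { fmt.Println("closed", r.id) }

// newResource attaches a finalizer that closes the resource when the GC
// finds it unreachable, mirroring the SetFinalizer call in the diff.
func newResource(id int) *resource {
	r := &resource{id: id}
	runtime.SetFinalizer(r, func(r *resource) { r.Close() })
	return r
}

func main() {
	_ = newResource(1)
	runtime.GC() // finalizers run after a GC cycle; timing is not guaranteed
	runtime.GC()
}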
scripts/build_windows.ps1

...
@@ -27,7 +27,6 @@ function checkEnv() {
     $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
 }
 # Locate CUDA versions
-# Note: this assumes every version found will be built
 $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
 if ($cudaList.length -eq 0) {
     $d=(get-command -ea 'silentlycontinue' nvcc).path
...
@@ -94,19 +93,6 @@ function buildOllama() {
     $hashEnv = @{}
     Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
-    if ("$script:CUDA_DIRS".Contains("v11")) {
-        $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }}
-        $env:CUDAToolkit_ROOT=$hashEnv[$v11]
-        write-host "Building CUDA v11 backend libraries"
-        # Note: cuda v11 requires msvc 2019 so force the older generator
-        # to avoid 2022 (or newer) from being used as the default
-        & cmake --fresh --preset "CUDA 11" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR
-        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS
-        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --install build --component "CUDA" --strip
-        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    }
     if ("$script:CUDA_DIRS".Contains("v12")) {
         $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }}
         $env:CUDAToolkit_ROOT=$hashEnv[$v12]
...
@@ -121,7 +107,7 @@ function buildOllama() {
     if ($env:HIP_PATH) {
         write-host "Building ROCm backend libraries"
         if (-Not (get-command -ErrorAction silent ninja)) {
-            $NINJA_DIR=(gci -path (Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation -r -fi ninja.exe) | split-path -parent
+            $NINJA_DIR=(gci -path (Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation -r -fi ninja.exe).Directory.FullName
             $env:PATH="$NINJA_DIR;$env:PATH"
         }
         $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe"
...
scripts/env.sh

...
@@ -10,9 +10,7 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \
     --build-arg=GOFLAGS \
     --build-arg=OLLAMA_CUSTOM_CPU_DEFS \
     --build-arg=OLLAMA_SKIP_CUDA_GENERATE \
-    --build-arg=OLLAMA_SKIP_CUDA_11_GENERATE \
     --build-arg=OLLAMA_SKIP_CUDA_12_GENERATE \
-    --build-arg=CUDA_V11_ARCHITECTURES \
     --build-arg=CUDA_V12_ARCHITECTURES \
     --build-arg=OLLAMA_SKIP_ROCM_GENERATE \
     --build-arg=OLLAMA_FAST_BUILD \
...
server/create.go

...
@@ -15,6 +15,7 @@ import (
"path/filepath"
"slices"
"strings"
"sync/atomic"
"github.com/gin-gonic/gin"
...
...
@@ -23,7 +24,6 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llama"
"github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model"
...
...
@@ -425,9 +425,14 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML,
 func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.ProgressResponse)) (*layerGGML, error) {
     ft := layer.GGML.KV().FileType()
     fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType)})

-    want, err := ggml.ParseFileType(quantizeType)
+    var doneBytes atomic.Uint64
+    totalBytes := uint64(layer.Size) - layer.GGML.Tensors().Offset
+    fnWrap := func(n uint64) {
+        done := doneBytes.Add(n)
+        progress := float32(done) / float32(totalBytes)
+        fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
+    }
+    ftype, err := ggml.ParseFileType(quantizeType)
     if err != nil {
         return nil, err
     }
...
@@ -436,6 +441,11 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
     if err != nil {
         return nil, err
     }
+    fp, err := os.Open(blob)
+    if err != nil {
+        return nil, err
+    }
+    defer fp.Close()

     temp, err := os.CreateTemp(filepath.Dir(blob), quantizeType)
     if err != nil {
...
@@ -444,15 +454,15 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
     defer temp.Close()
     defer os.Remove(temp.Name())

-    if err := llama.Quantize(blob, temp.Name(), uint32(want)); err != nil {
+    if err := quantize(fp, temp, layer.GGML, ftype, fnWrap); err != nil {
         return nil, err
     }

+    temp.Seek(0, io.SeekStart)
     fn(api.ProgressResponse{Status: "verifying conversion"})

     newLayer, err := NewLayer(temp, layer.MediaType)
     if err != nil {
         return nil, err
     }

-    if _, err := temp.Seek(0, io.SeekStart); err != nil {
-        return nil, err
-    }
...
@@ -462,7 +472,6 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
         slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
         return nil, err
     }

     return &layerGGML{newLayer, f}, nil
 }
...
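Note: quantizeLayer now reports byte-level progress through an atomic counter whose fraction of processed tensor bytes is scaled to the layer size for the UI. A standalone Go sketch of that wrapper (values are placeholders):

package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	var doneBytes atomic.Uint64
	totalBytes := uint64(1 << 20)   // tensor bytes (layer size minus header)
	layerSize := int64(1<<20 + 4096) // reported total includes the header

	// report aggregates per-chunk byte counts safely, even if workers call
	// it concurrently, then converts the fraction into "completed" bytes.
	report := func(n uint64) {
		done := doneBytes.Add(n)
		progress := float32(done) / float32(totalBytes)
		fmt.Printf("completed %d of %d\n", int64(progress*float32(layerSize)), layerSize)
	}

	report(512 << 10) // half done
	report(512 << 10) // finished
}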
server/images.go

...
@@ -106,6 +106,11 @@ func (m *Model) Capabilities() []model.Capability {
         capabilities = append(capabilities, model.CapabilityInsert)
     }

+    // Check for vision capability in projector-based models
+    if len(m.ProjectorPaths) > 0 {
+        capabilities = append(capabilities, model.CapabilityVision)
+    }
+
     return capabilities
 }
...