Unverified commit 20c5fd39 authored by Devon Rifkin, committed by GitHub

Merge branch 'main' into drifkin/array-head-count-simple

parents d2ee599d 6e9a7a25
@@ -89,8 +89,8 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (p *mistral3Model) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 	for _, t := range ts {
 		if !strings.HasPrefix(t.Name(), "v.") {
@@ -100,7 +100,7 @@ func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor {
 		}
 	}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
...
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
 		return true
 	})
 
-	var out []ggml.Tensor
+	var out []*ggml.Tensor
 	for n, e := range experts {
 		// TODO(mxyng): sanity check experts
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  n,
 			Kind:  e[0].Kind(),
 			Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),
...
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []*ggml.Tensor {
 	var addRopeFactors sync.Once
 
-	out := make([]ggml.Tensor, 0, len(ts)+2)
+	out := make([]*ggml.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
 		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
-				out = append(out, ggml.Tensor{
+				out = append(out, &ggml.Tensor{
 					Name:     "rope_factors_long.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
 					WriterTo: p.RopeScaling.LongFactor,
-				}, ggml.Tensor{
+				}, &ggml.Tensor{
 					Name:     "rope_factors_short.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
 			})
 		}
 
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
...
@@ -45,10 +45,10 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
 	return kv
 }
 
-func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor {
-	var out []ggml.Tensor
+func (q *qwen2Model) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
 	for _, t := range ts {
-		out = append(out, ggml.Tensor{
+		out = append(out, &ggml.Tensor{
 			Name:  t.Name(),
 			Kind:  t.Kind(),
 			Shape: t.Shape(),
...
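Every converter hunk above makes the same change: `Tensors` now returns `[]*ggml.Tensor` instead of `[]ggml.Tensor`. The reason is visible later in this commit: `WriteGGUF` assigns each tensor's `Offset` in place (`ts[i].Offset = s`) and the payload writers read it back, so those mutations must be visible through the slice the converter returned. A minimal sketch of the difference, with a simplified `Tensor` type standing in for `ggml.Tensor`:

```go
package main

import "fmt"

type Tensor struct {
	Name   string
	Offset uint64
}

// assignOffsets mutates the tensors in place, as WriteGGUF does when
// laying out tensor data. This only works because the slice holds
// pointers; with []Tensor, `range` would copy each element and the
// offsets written here would be lost to the caller.
func assignOffsets(ts []*Tensor, size uint64) {
	var s uint64
	for _, t := range ts {
		t.Offset = s
		s += size
	}
}

func main() {
	ts := []*Tensor{{Name: "token_embd.weight"}, {Name: "output_norm.weight"}}
	assignOffsets(ts, 128)
	fmt.Println(ts[1].Name, ts[1].Offset) // output_norm.weight 128
}
```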
@@ -130,6 +130,7 @@ func TestConvertModel(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
+	defer expectFile.Close()
 
 	var expect map[string]string
 	if err := json.NewDecoder(expectFile).Decode(&expect); err != nil {
...
package convert

import (
	"archive/zip"
	"errors"
	"io"
	"io/fs"
	"os"
	"path/filepath"
)

type ZipReader struct {
	r *zip.Reader
	p string

	// limit is the maximum size of a file that can be read directly
	// from the zip archive. Files larger than this size will be extracted
	limit int64
}

func NewZipReader(r *zip.Reader, p string, limit int64) fs.FS {
	return &ZipReader{r, p, limit}
}

func (z *ZipReader) Open(name string) (fs.File, error) {
	r, err := z.r.Open(name)
	if err != nil {
		return nil, err
	}
	defer r.Close()

	if fi, err := r.Stat(); err != nil {
		return nil, err
	} else if fi.Size() < z.limit {
		return r, nil
	}

	if !filepath.IsLocal(name) {
		return nil, zip.ErrInsecurePath
	}

	n := filepath.Join(z.p, name)
	if _, err := os.Stat(n); errors.Is(err, os.ErrNotExist) {
		w, err := os.Create(n)
		if err != nil {
			return nil, err
		}
		defer w.Close()

		if _, err := io.Copy(w, r); err != nil {
			return nil, err
		}
	} else if err != nil {
		return nil, err
	}

	return os.Open(n)
}
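For context, a usage sketch of this size-limited zip filesystem. The function name, archive path, and entry name are hypothetical, and the caller is responsible for cleaning up the scratch directory; entries smaller than the limit are served straight from the archive, while larger ones are spilled to disk once and opened from there:

```go
package convert

import (
	"archive/zip"
	"io"
	"os"
)

// readFromArchive is a hypothetical helper showing how NewZipReader
// might be wired up by a caller.
func readFromArchive(zipPath, name string) ([]byte, error) {
	f, err := os.Open(zipPath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		return nil, err
	}

	zr, err := zip.NewReader(f, fi.Size())
	if err != nil {
		return nil, err
	}

	scratch, err := os.MkdirTemp("", "unzip")
	if err != nil {
		return nil, err
	}

	// 32 MiB cutoff (an arbitrary choice for this sketch): small entries
	// stream from the archive, large tensors land on disk first.
	fsys := NewZipReader(zr, scratch, 32*1024*1024)

	file, err := fsys.Open(name)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	return io.ReadAll(file)
}
```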
@@ -3,6 +3,7 @@
 package discover
 
 import (
+	"fmt"
 	"log/slog"
 	"os"
 	"regexp"
@@ -59,6 +60,8 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
 	// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
 	if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
+		// The detected driver is older than Feb 2023
+		slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
 		return "v11"
 	}
 	return "v12"
...
@@ -27,12 +27,14 @@
 #endif
 
+#ifndef LOG
 #define LOG(verbose, ...) \
   do { \
     if (verbose) { \
      fprintf(stderr, __VA_ARGS__); \
    } \
  } while (0)
+#endif
 
 #ifdef __cplusplus
 extern "C" {
...
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
 
 #include <string.h>
+#include <inttypes.h>
 
 #include "gpu_info_cudart.h"
 
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
@@ -58,7 +59,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
     LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
     UNLOAD_LIBRARY(resp->ch.handle);
     resp->ch.handle = NULL;
-    if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
+    if (ret == CUDART_ERROR_INSUFFICIENT_DRIVER) {
       resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
       return;
     }
@@ -168,9 +169,9 @@ void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
   resp->free = memInfo.free;
   resp->used = memInfo.used;
 
-  LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
-  LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
-  LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
+  LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "\n", resp->gpu_id, resp->total);
+  LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "\n", resp->gpu_id, resp->free);
+  LOG(h.verbose, "[%s] CUDA usedMem %" PRId64 "\n", resp->gpu_id, resp->used);
   LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
 }
...
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
 
 #include <string.h>
+#include <inttypes.h>
 
 #include "gpu_info_nvcuda.h"
 
 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
@@ -193,8 +194,8 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
   resp->total = memInfo.total;
   resp->free = memInfo.free;
 
-  LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
-  LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
+  LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "mb\n", resp->gpu_id, resp->total / 1024 / 1024);
+  LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "mb\n", resp->gpu_id, resp->free / 1024 / 1024);
   LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
...
@@ -12,7 +12,7 @@ import (
 // '../lib/ollama' on Linux and the executable's directory on macOS
 // note: distribution builds, additional GPU-specific libraries are
 // found in subdirectories of the returned path, such as
-// 'cuda_v11', 'cuda_v12', 'rocm', etc.
+// 'cuda_v12', 'rocm', etc.
 var LibOllamaPath string = func() string {
 	exe, err := os.Executable()
 	if err != nil {
...
@@ -394,9 +394,6 @@ curl http://localhost:11434/api/generate -d '{
       "repeat_penalty": 1.2,
       "presence_penalty": 1.5,
       "frequency_penalty": 1.0,
-      "mirostat": 1,
-      "mirostat_tau": 0.8,
-      "mirostat_eta": 0.6,
       "penalize_newline": true,
       "stop": ["\n", "user:"],
       "numa": false,
@@ -404,10 +401,7 @@ curl http://localhost:11434/api/generate -d '{
       "num_batch": 2,
       "num_gpu": 1,
       "main_gpu": 0,
-      "low_vram": false,
-      "vocab_only": false,
       "use_mmap": true,
-      "use_mlock": false,
       "num_thread": 8
     }
   }'
...
@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
 
 ## How can I specify the context window size?
 
-By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.
+By default, Ollama uses a context window size of 4096 tokens.
 
 This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
 
@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
 To change this when using `ollama run`, use `/set parameter`:
 
 ```shell
-/set parameter num_ctx 8192
+/set parameter num_ctx 4096
 ```
 
 When using the API, specify the `num_ctx` parameter:
 
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
   "model": "llama3.2",
   "prompt": "Why is the sky blue?",
   "options": {
-    "num_ctx": 8192
+    "num_ctx": 4096
   }
 }'
 ```
...
 # GPU
 
 ## Nvidia
 
-Ollama supports Nvidia GPUs with compute capability 5.0+.
+Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.
 
 Check your compute compatibility to see if your card is supported:
 [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
...
@@ -150,9 +150,6 @@ PARAMETER <parameter> <parametervalue>
 
 | Parameter      | Description                                                                                                                                                                                                                      | Value Type | Example Usage        |
 | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
-| mirostat       | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)                                                                                                                    | int        | mirostat 0           |
-| mirostat_eta   | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1)  | float      | mirostat_eta 0.1     |
-| mirostat_tau   | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0)                                                                                   | float      | mirostat_tau 5.0     |
 | num_ctx        | Sets the size of the context window used to generate the next token. (Default: 2048)                                                                                                                                              | int        | num_ctx 4096         |
 | repeat_last_n  | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)                                                                                                                     | int        | repeat_last_n 64     |
 | repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)                                                | float      | repeat_penalty 1.1   |
...
@@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto
 
 In the server log, you will see a message that looks something like this (varies from release to release):
 
 ```
-Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
+Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
 ```
 
 **Experimental LLM Library Override**
...
@@ -169,7 +169,7 @@ var (
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
-	ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
+	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
 )
 
 func String(s string) func() string {
@@ -227,20 +227,6 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
 	}
 }
 
-func Int64(key string, defaultValue int64) func() int64 {
-	return func() int64 {
-		if s := Var(key); s != "" {
-			if n, err := strconv.ParseInt(s, 10, 64); err != nil {
-				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
-			} else {
-				return n
-			}
-		}
-
-		return defaultValue
-	}
-}
-
 // Set aside VRAM per GPU
 var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
 
@@ -269,7 +255,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS":         {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":    {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH":  {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
+		"OLLAMA_CONTEXT_LENGTH":  {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
 		"OLLAMA_NEW_ENGINE":      {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
 
 		// Informational
...
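The removed `Int64` helper is replaced by a `Uint` helper whose body isn't shown in this diff. As a rough sketch of what it presumably looks like, modeled directly on the removed `Int64` (the real implementation may differ, e.g. it reads through the package's `Var` accessor rather than `os.Getenv`):

```go
package main

import (
	"fmt"
	"log/slog"
	"os"
	"strconv"
)

// Uint mirrors the removed Int64 helper: read the variable lazily on
// each call, warn and fall back to the default on bad input.
func Uint(key string, defaultValue uint) func() uint {
	return func() uint {
		if s := os.Getenv(key); s != "" {
			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
			} else {
				return uint(n)
			}
		}

		return defaultValue
	}
}

func main() {
	ContextLength := Uint("OLLAMA_CONTEXT_LENGTH", 4096)
	fmt.Println(ContextLength()) // 4096 unless the variable overrides it
}
```

Returning a closure rather than a value means every call re-reads the environment, so tests can flip the variable per case.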
@@ -278,9 +278,9 @@ func TestVar(t *testing.T) {
 }
 
 func TestContextLength(t *testing.T) {
-	cases := map[string]int64{
-		"":     -1,
-		"4096": 4096,
+	cases := map[string]uint{
+		"":     4096,
+		"2048": 2048,
 	}
 
 	for k, v := range cases {
...
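The loop body is elided in this hunk. A hedged sketch of how such a table-driven test typically runs, assuming the usual `t.Setenv` pattern (the actual loop may differ):

```go
package envconfig

import "testing"

// TestContextLengthSketch scopes OLLAMA_CONTEXT_LENGTH to each subtest
// and checks that ContextLength() re-reads it; the empty-string case
// exercises the fallback to the new 4096 default.
func TestContextLengthSketch(t *testing.T) {
	cases := map[string]uint{
		"":     4096, // unset: fall back to the default
		"2048": 2048,
	}

	for k, v := range cases {
		t.Run(k, func(t *testing.T) {
			t.Setenv("OLLAMA_CONTEXT_LENGTH", k)
			if n := ContextLength(); n != v {
				t.Errorf("ContextLength() = %d, want %d", n, v)
			}
		})
	}
}
```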
@@ -37,12 +37,12 @@ func (kv KV) ParameterCount() uint64 {
 	return val
 }
 
-func (kv KV) FileType() fileType {
+func (kv KV) FileType() FileType {
 	if t := kv.Uint("general.file_type"); t > 0 {
-		return fileType(t)
+		return FileType(t)
 	}
 
-	return fileTypeUnknown
+	return FileTypeUnknown
 }
 
 func (kv KV) BlockCount() uint64 {
@@ -194,7 +194,7 @@ func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ..
 		return val, true
 	}
 
-	slog.Warn("key with type not found", "key", key, "default", defaultValue[0])
+	slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
 	return defaultValue[0], false
 }
 
@@ -271,7 +271,11 @@ func (t Tensor) block() (n int) {
 }
 
 func (t Tensor) blockSize() uint64 {
-	switch t.Kind {
+	return (TensorType)(t.Kind).BlockSize()
+}
+
+func (t TensorType) BlockSize() uint64 {
+	switch t {
 	case
 		0, // F32
 		1, // F16
@@ -297,73 +301,77 @@ func (t Tensor) blockSize() uint64 {
 }
 
 func (t Tensor) typeSize() uint64 {
-	blockSize := t.blockSize()
+	return TensorType(t.Kind).TypeSize()
+}
+
+func (t TensorType) TypeSize() uint64 {
+	blockSize := t.BlockSize()
 
-	switch t.Kind {
-	case 0: // FP32
+	switch t {
+	case TensorTypeF32:
 		return 4
-	case 1: // FP16
+	case TensorTypeF16:
 		return 2
-	case 2: // Q4_0
+	case TensorTypeQ4_0:
 		return 2 + blockSize/2
-	case 3: // Q4_1
+	case TensorTypeQ4_1:
 		return 2 + 2 + blockSize/2
-	case 6: // Q5_0
+	case TensorTypeQ5_0:
 		return 2 + 4 + blockSize/2
-	case 7: // Q5_1
+	case TensorTypeQ5_1:
 		return 2 + 2 + 4 + blockSize/2
-	case 8: // Q8_0
+	case TensorTypeQ8_0:
 		return 2 + blockSize
-	case 9: // Q8_1
+	case TensorTypeQ8_1:
 		return 2 + 2 + blockSize
-	case 10: // Q2_K
+	case TensorTypeQ2_K:
 		return blockSize/16 + blockSize/4 + 2 + 2
-	case 11: // Q3_K
+	case TensorTypeQ3_K:
 		return blockSize/8 + blockSize/4 + 12 + 2
-	case 12: // Q4_K
+	case TensorTypeQ4_K:
 		return 2 + 2 + 12 + blockSize/2
-	case 13: // Q5_K
+	case TensorTypeQ5_K:
 		return 2 + 2 + 12 + blockSize/8 + blockSize/2
-	case 14: // Q6_K
+	case TensorTypeQ6_K:
 		return blockSize/2 + blockSize/4 + blockSize/16 + 2
-	case 15: // Q8_K
+	case TensorTypeQ8_K:
 		return 4 + blockSize + 2*blockSize/16
-	case 16: // IQ2_XXS
+	case tensorTypeIQ2_XXS:
 		return 2 + 2*blockSize/8
-	case 17: // IQ2_XS
+	case tensorTypeIQ2_XS:
 		return 2 + 2*blockSize/8 + blockSize/32
-	case 18: // IQ3_XXS
+	case tensorTypeIQ3_XXS:
 		return 2 + blockSize/4 + blockSize/8
-	case 19: // IQ1_S
+	case tensorTypeIQ1_S:
 		return 2 + blockSize/8 + blockSize/16
-	case 20: // IQ4_NL
+	case tensorTypeIQ4_NL:
 		return 2 + blockSize/2
-	case 21: // IQ3_S
+	case tensorTypeIQ3_S:
 		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
-	case 22: // IQ2_S
+	case tensorTypeIQ2_S:
 		return 2 + blockSize/4 + blockSize/16
-	case 23: // IQ4_XS
+	case tensorTypeIQ4_XS:
 		return 2 + 2 + blockSize/2 + blockSize/64
-	case 24: // I8
+	case TensorTypeI8:
 		return 1
-	case 25: // I16
+	case TensorTypeI16:
 		return 2
-	case 26: // I32
+	case TensorTypeI32:
 		return 4
-	case 27: // I64
+	case TensorTypeI64:
 		return 8
-	case 28: // F64
+	case TensorTypeF64:
 		return 8
-	case 29: // IQ1_M
+	case tensorTypeIQ1_M:
 		return blockSize/8 + blockSize/16 + blockSize/32
-	case 30: // BF16
+	case TensorTypeBF16:
 		return 2
 	default:
 		return 0
 	}
 }
 
-func (t Tensor) parameters() uint64 {
+func (t Tensor) Elements() uint64 {
 	var count uint64 = 1
 	for _, n := range t.Shape {
 		count *= n
@@ -372,11 +380,11 @@ func (t Tensor) parameters() uint64 {
 }
 
 func (t Tensor) Size() uint64 {
-	return t.parameters() * t.typeSize() / t.blockSize()
+	return t.Elements() * t.typeSize() / t.blockSize()
 }
 
 func (t Tensor) Type() string {
-	return fileType(t.Kind).String()
+	return TensorType(t.Kind).String()
 }
 
 type container interface {
@@ -525,7 +533,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 	var ropeFreqsCount uint64
 	if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
 		if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
-			ropeFreqsCount = ropeFreqsWeights.parameters()
+			ropeFreqsCount = ropeFreqsWeights.Elements()
 		}
 	}
...
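To make the size arithmetic above concrete: `Size = Elements * TypeSize / BlockSize`. For Q4_0 the per-block cost is `2 + blockSize/2` bytes (a 2-byte scale plus two 4-bit weights per byte), and GGML's block size for the Q4_0 family is 32 (a fact from the GGML format, not visible in the truncated `BlockSize` hunk). A worked sketch with a hypothetical 4096x4096 weight matrix:

```go
package main

import "fmt"

func main() {
	const (
		blockSize = 32              // Q4_0 groups weights into blocks of 32 (GGML convention)
		typeSize  = 2 + blockSize/2 // 2-byte scale + 16 bytes of packed 4-bit weights = 18
	)

	elements := uint64(4096) * 4096 // hypothetical 4096x4096 weight matrix
	size := elements / blockSize * typeSize

	fmt.Println(size) // 9437184 bytes, i.e. 0.5625 bytes per parameter
}
```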
@@ -9,8 +9,12 @@ import (
 	"io"
 	"log/slog"
 	"maps"
+	"os"
+	"runtime"
 	"slices"
 	"strings"
+
+	"golang.org/x/sync/errgroup"
 )
 
 type containerGGUF struct {
@@ -225,7 +229,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 		}
 
 		llm.tensors = append(llm.tensors, &tensor)
-		llm.parameters += tensor.parameters()
+		llm.parameters += tensor.Elements()
 	}
 
 	// patch KV with parameter count
@@ -488,25 +492,38 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
 		return err
 	}
 
+	if t == ggufTypeString {
+		for _, e := range any(s).([]string) {
+			if err := binary.Write(w, binary.LittleEndian, uint64(len(e))); err != nil {
+				return err
+			}
+
+			if err := binary.Write(w, binary.LittleEndian, []byte(e)); err != nil {
+				return err
+			}
+		}
+
+		return nil
+	}
+
 	return binary.Write(w, binary.LittleEndian, s)
 }
 
-func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
+func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
 	alignment := kv.Uint("general.alignment", 32)
 
-	if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil {
+	if err := binary.Write(f, binary.LittleEndian, []byte("GGUF")); err != nil {
 		return err
 	}
 
-	if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil {
+	if err := binary.Write(f, binary.LittleEndian, uint32(3)); err != nil {
 		return err
 	}
 
-	if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil {
+	if err := binary.Write(f, binary.LittleEndian, uint64(len(ts))); err != nil {
 		return err
 	}
 
-	if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil {
+	if err := binary.Write(f, binary.LittleEndian, uint64(len(kv))); err != nil {
 		return err
 	}
 
@@ -514,12 +531,12 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
 	slices.Sort(keys)
 
 	for _, key := range keys {
-		if err := ggufWriteKV(ws, key, kv[key]); err != nil {
+		if err := ggufWriteKV(f, key, kv[key]); err != nil {
 			return err
 		}
 	}
 
-	slices.SortStableFunc(ts, func(a, b Tensor) int {
+	slices.SortStableFunc(ts, func(a, b *Tensor) int {
 		if i, j := a.block(), b.block(); i < 0 && j > 0 {
 			return 1
 		} else if i > 0 && j < 0 {
@@ -530,21 +547,34 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
 	})
 
 	var s uint64
-	for _, t := range ts {
-		t.Offset = s + uint64(ggufPadding(int64(s), int64(alignment)))
-		if err := ggufWriteTensorInfo(ws, t); err != nil {
+	for i := range ts {
+		ts[i].Offset = s
+		if err := ggufWriteTensorInfo(f, ts[i]); err != nil {
 			return err
 		}
-		s += t.Size()
+		s += ts[i].Size()
+		s += uint64(ggufPadding(int64(s), int64(alignment)))
 	}
 
-	for _, t := range ts {
-		if err := ggufWriteTensor(ws, t, int64(alignment)); err != nil {
-			return err
-		}
+	offset, err := f.Seek(0, io.SeekCurrent)
+	if err != nil {
+		return err
+	}
+	offset += ggufPadding(offset, int64(alignment))
+
+	var g errgroup.Group
+	g.SetLimit(runtime.GOMAXPROCS(0))
+	// TODO consider reducing if tensors size * gomaxprocs is larger than free memory
+	for _, t := range ts {
+		t := t
+		w := io.NewOffsetWriter(f, offset+int64(t.Offset))
+		g.Go(func() error {
+			_, err := t.WriteTo(w)
+			return err
+		})
 	}
 
-	return nil
+	return g.Wait()
 }
 
 func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
@@ -559,8 +589,10 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
 	var err error
 	switch v := v.(type) {
-	case uint32:
+	case uint32, FileType:
 		err = writeGGUF(ws, ggufTypeUint32, v)
+	case uint64:
+		err = writeGGUF(ws, ggufTypeUint64, v)
 	case float32:
 		err = writeGGUF(ws, ggufTypeFloat32, v)
 	case bool:
@@ -569,32 +601,20 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
 		err = writeGGUFString(ws, v)
 	case []int32:
 		err = writeGGUFArray(ws, ggufTypeInt32, v)
+	case *array[int32]:
+		err = writeGGUFArray(ws, ggufTypeInt32, v.values)
 	case []uint32:
 		err = writeGGUFArray(ws, ggufTypeUint32, v)
+	case *array[uint32]:
+		err = writeGGUFArray(ws, ggufTypeUint32, v.values)
 	case []float32:
 		err = writeGGUFArray(ws, ggufTypeFloat32, v)
+	case *array[float32]:
+		err = writeGGUFArray(ws, ggufTypeFloat32, v.values)
 	case []string:
-		if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil {
-			return err
-		}
-
-		if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil {
-			return err
-		}
-
-		if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil {
-			return err
-		}
-
-		for _, e := range v {
-			if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil {
-				return err
-			}
-
-			if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil {
-				return err
-			}
-		}
+		err = writeGGUFArray(ws, ggufTypeString, v)
+	case *array[string]:
+		err = writeGGUFArray(ws, ggufTypeString, v.values)
 	default:
 		return fmt.Errorf("improper type for '%s'", k)
 	}
@@ -602,7 +622,7 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
 	return err
 }
 
-func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
+func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error {
 	slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset)
 	if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil {
 		return err
@@ -629,20 +649,6 @@ func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
 	return binary.Write(ws, binary.LittleEndian, t.Offset)
 }
 
-func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error {
-	offset, err := ws.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return err
-	}
-
-	if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil {
-		return err
-	}
-
-	_, err = t.WriteTo(ws)
-	return err
-}
-
 func ggufPadding(offset, align int64) int64 {
 	return (align - offset%align) % align
 }
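The rewritten `WriteGGUF` is why the API now takes `*os.File` instead of `io.WriteSeeker`: tensor payloads are written concurrently, which needs positioned writes (`io.WriterAt`) rather than a shared seek pointer. Offsets are precomputed relative to the start of tensor data, each padded up to the alignment via `ggufPadding` (e.g. `ggufPadding(100, 32) = (32 - 100%32) % 32 = 28`); the absolute base offset is then found with one `Seek`, aligned, and added per tensor. A self-contained sketch of that pattern, with dummy byte chunks standing in for tensors:

```go
package main

import (
	"fmt"
	"io"
	"os"
	"runtime"

	"golang.org/x/sync/errgroup"
)

// padding mirrors ggufPadding: bytes needed to round offset up to align.
func padding(offset, align int64) int64 {
	return (align - offset%align) % align
}

func main() {
	f, err := os.CreateTemp("", "parallel-*.bin")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())
	defer f.Close()

	// Precompute an aligned offset for each chunk, as WriteGGUF does for
	// tensor data: chunk sizes 100, 200, 50 land at offsets 0, 128, 352.
	chunks := [][]byte{make([]byte, 100), make([]byte, 200), make([]byte, 50)}
	offsets := make([]int64, len(chunks))
	var s int64
	for i, c := range chunks {
		offsets[i] = s
		s += int64(len(c))
		s += padding(s, 32)
	}

	// Fan the writes out under an errgroup capped at GOMAXPROCS.
	// io.NewOffsetWriter gives each goroutine its own position into the
	// same *os.File, so no shared seek pointer is mutated.
	var g errgroup.Group
	g.SetLimit(runtime.GOMAXPROCS(0))
	for i, c := range chunks {
		i, c := i, c // pin loop variables (Go < 1.22), like the diff's t := t
		w := io.NewOffsetWriter(f, offsets[i])
		g.Go(func() error {
			_, err := w.Write(c)
			return err
		})
	}
	if err := g.Wait(); err != nil {
		panic(err)
	}

	fmt.Println("last chunk at offset", offsets[len(offsets)-1])
}
```

This also explains the dropped `ggufWriteTensor` helper: padding is no longer emitted by seeking and writing zero bytes sequentially; the gaps simply remain as holes the offset writers skip over.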