Unverified Commit 20c5fd39 authored by Devon Rifkin's avatar Devon Rifkin Committed by GitHub
Browse files

Merge branch 'main' into drifkin/array-head-count-simple

parents d2ee599d 6e9a7a25
package ggml
import (
"bytes"
"os"
"slices"
"testing"
"github.com/google/go-cmp/cmp"
)
// TestWriteGGUF writes a small GGUF file containing one KV entry and six
// tensors, decodes it again, and compares the decoded KV set and tensor table
// against the expected values.
func TestWriteGGUF(t *testing.T) {
	w, err := os.CreateTemp(t.TempDir(), "*.bin")
	if err != nil {
		t.Fatal(err)
	}
	defer w.Close()

	// Every tensor carries 2*3*4 = 24 zero bytes of data.
	if err := WriteGGUF(w, KV{
		"general.alignment": uint32(16),
	}, []*Tensor{
		{Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
		{Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
		{Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
		{Name: "test.3", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
		{Name: "test.4", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
		{Name: "test.5", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
	}); err != nil {
		t.Fatal(err)
	}

	r, err := os.Open(w.Name())
	if err != nil {
		t.Fatal(err)
	}
	defer r.Close()

	ff, _, err := Decode(r, 0)
	if err != nil {
		t.Fatal(err)
	}

	wantKV := KV{
		"general.alignment":       uint32(16),
		"general.parameter_count": uint64(36),
	}
	// cmp.Diff marks its first argument with "-" and its second with "+", so
	// the expected value must come first to match the "(-want +got)" label.
	if diff := cmp.Diff(wantKV, ff.KV()); diff != "" {
		t.Errorf("Mismatch (-want +got):\n%s", diff)
	}

	// Expected offsets advance in steps of 32 even though each tensor holds 24
	// bytes; presumably the writer pads to the 16-byte alignment set above.
	wantTensors := Tensors{
		Offset: 336,
		items: []*Tensor{
			{Name: "test.0", Offset: 0, Shape: []uint64{2, 3}},
			{Name: "test.1", Offset: 32, Shape: []uint64{2, 3}},
			{Name: "test.2", Offset: 64, Shape: []uint64{2, 3}},
			{Name: "test.3", Offset: 96, Shape: []uint64{2, 3}},
			{Name: "test.4", Offset: 128, Shape: []uint64{2, 3}},
			{Name: "test.5", Offset: 160, Shape: []uint64{2, 3}},
		},
	}
	if diff := cmp.Diff(wantTensors, ff.Tensors(), cmp.AllowUnexported(Tensors{})); diff != "" {
		t.Errorf("Mismatch (-want +got):\n%s", diff)
	}
}
package ggml package ggml
import "fmt" import (
"fmt"
"log/slog"
"strings"
)
type fileType uint32 // FileType is the Go equivalent to llama_ftype used for gguf file typing
type FileType uint32
const ( const (
fileTypeF32 fileType = iota FileTypeF32 FileType = iota
fileTypeF16 FileTypeF16
fileTypeQ4_0 FileTypeQ4_0
fileTypeQ4_1 FileTypeQ4_1
fileTypeQ4_1_F16 fileTypeQ4_1_F16 // unused by GGML
fileTypeQ4_2 // unused fileTypeQ4_2 // unused by GGML
fileTypeQ4_3 // unused fileTypeQ4_3 // unused by GGML
fileTypeQ8_0 FileTypeQ8_0
fileTypeQ5_0 FileTypeQ5_0
fileTypeQ5_1 FileTypeQ5_1
fileTypeQ2_K FileTypeQ2_K
fileTypeQ3_K_S FileTypeQ3_K_S
fileTypeQ3_K_M FileTypeQ3_K_M
fileTypeQ3_K_L FileTypeQ3_K_L
fileTypeQ4_K_S FileTypeQ4_K_S
fileTypeQ4_K_M FileTypeQ4_K_M
fileTypeQ5_K_S FileTypeQ5_K_S
fileTypeQ5_K_M FileTypeQ5_K_M
fileTypeQ6_K FileTypeQ6_K
fileTypeIQ2_XXS fileTypeIQ2_XXS // not supported by ollama
fileTypeIQ2_XS fileTypeIQ2_XS // not supported by ollama
fileTypeQ2_K_S FileTypeQ2_K_S
fileTypeIQ3_XS fileTypeIQ3_XS // not supported by ollama
fileTypeIQ3_XXS fileTypeIQ3_XXS // not supported by ollama
fileTypeIQ1_S fileTypeIQ1_S // not supported by ollama
fileTypeIQ4_NL fileTypeIQ4_NL // not supported by ollama
fileTypeIQ3_S fileTypeIQ3_S // not supported by ollama
fileTypeIQ3_M fileTypeIQ3_M // not supported by ollama
fileTypeIQ2_S fileTypeIQ2_S // not supported by ollama
fileTypeIQ2_M fileTypeIQ2_M // not supported by ollama
fileTypeIQ4_XS fileTypeIQ4_XS // not supported by ollama
fileTypeIQ1_M fileTypeIQ1_M // not supported by ollama
fileTypeBF16 FileTypeBF16
fileTypeQ4_0_4_4 // unused by GGML
fileTypeQ4_0_4_8 // unused by GGML
fileTypeQ4_0_8_8 // unused by GGML
fileTypeTQ1_0 // not supported by ollama
fileTypeTQ2_0 // not supported by ollama
fileTypeUnknown FileTypeUnknown = 1024
) )
func ParseFileType(s string) (fileType, error) { // ParseFileType parses the provided GGUF file type
// Only Ollama supported types are considered valid
func ParseFileType(s string) (FileType, error) {
switch s { switch s {
case "F32": case "F32":
return fileTypeF32, nil return FileTypeF32, nil
case "F16": case "F16":
return fileTypeF16, nil return FileTypeF16, nil
case "Q4_0": case "Q4_0":
return fileTypeQ4_0, nil return FileTypeQ4_0, nil
case "Q4_1": case "Q4_1":
return fileTypeQ4_1, nil return FileTypeQ4_1, nil
case "Q4_1_F16":
return fileTypeQ4_1_F16, nil
case "Q8_0": case "Q8_0":
return fileTypeQ8_0, nil return FileTypeQ8_0, nil
case "Q5_0": case "Q5_0":
return fileTypeQ5_0, nil return FileTypeQ5_0, nil
case "Q5_1": case "Q5_1":
return fileTypeQ5_1, nil return FileTypeQ5_1, nil
case "Q2_K": case "Q2_K":
return fileTypeQ2_K, nil return FileTypeQ2_K, nil
case "Q3_K_S": case "Q3_K_S":
return fileTypeQ3_K_S, nil return FileTypeQ3_K_S, nil
case "Q3_K_M": case "Q3_K_M":
return fileTypeQ3_K_M, nil return FileTypeQ3_K_M, nil
case "Q3_K_L": case "Q3_K_L":
return fileTypeQ3_K_L, nil return FileTypeQ3_K_L, nil
case "Q4_K_S": case "Q4_K_S":
return fileTypeQ4_K_S, nil return FileTypeQ4_K_S, nil
case "Q4_K_M": case "Q4_K_M", "Q4_K":
return fileTypeQ4_K_M, nil return FileTypeQ4_K_M, nil
case "Q5_K_S": case "Q5_K_S":
return fileTypeQ5_K_S, nil return FileTypeQ5_K_S, nil
case "Q5_K_M": case "Q5_K_M", "Q5_K":
return fileTypeQ5_K_M, nil return FileTypeQ5_K_M, nil
case "Q6_K": case "Q6_K":
return fileTypeQ6_K, nil return FileTypeQ6_K, nil
case "IQ2_XXS":
return fileTypeIQ2_XXS, nil
case "IQ2_XS":
return fileTypeIQ2_XS, nil
case "Q2_K_S": case "Q2_K_S":
return fileTypeQ2_K_S, nil return FileTypeQ2_K_S, nil
case "IQ3_XS":
return fileTypeIQ3_XS, nil
case "IQ3_XXS":
return fileTypeIQ3_XXS, nil
case "IQ1_S":
return fileTypeIQ1_S, nil
case "IQ4_NL":
return fileTypeIQ4_NL, nil
case "IQ3_S":
return fileTypeIQ3_S, nil
case "IQ3_M":
return fileTypeIQ3_M, nil
case "IQ2_S":
return fileTypeIQ2_S, nil
case "IQ2_M":
return fileTypeIQ2_M, nil
case "IQ4_XS":
return fileTypeIQ4_XS, nil
case "IQ1_M":
return fileTypeIQ1_M, nil
case "BF16": case "BF16":
return fileTypeBF16, nil return FileTypeBF16, nil
default: default:
return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s) supportedFileTypes := []FileType{
FileTypeF32,
FileTypeF16,
FileTypeQ4_K_S,
FileTypeQ4_K_M,
FileTypeQ8_0,
// fsggml.FileTypeBF16, // TODO
}
strs := make([]string, len(supportedFileTypes))
for i := range supportedFileTypes {
strs[i] = supportedFileTypes[i].String()
}
return FileTypeUnknown, fmt.Errorf("unsupported quantization type %s - supported types are %s", s, strings.Join(strs, ", "))
} }
} }
func (t fileType) String() string { func (t FileType) String() string {
switch t { switch t {
case fileTypeF32: case FileTypeF32:
return "F32" return "F32"
case fileTypeF16: case FileTypeF16:
return "F16" return "F16"
case fileTypeQ4_0: case FileTypeQ4_0:
return "Q4_0" return "Q4_0"
case fileTypeQ4_1: case FileTypeQ4_1:
return "Q4_1" return "Q4_1"
case fileTypeQ4_1_F16: case FileTypeQ8_0:
return "Q4_1_F16"
case fileTypeQ8_0:
return "Q8_0" return "Q8_0"
case fileTypeQ5_0: case FileTypeQ5_0:
return "Q5_0" return "Q5_0"
case fileTypeQ5_1: case FileTypeQ5_1:
return "Q5_1" return "Q5_1"
case fileTypeQ2_K: case FileTypeQ2_K:
return "Q2_K" return "Q2_K"
case fileTypeQ3_K_S: case FileTypeQ3_K_S:
return "Q3_K_S" return "Q3_K_S"
case fileTypeQ3_K_M: case FileTypeQ3_K_M:
return "Q3_K_M" return "Q3_K_M"
case fileTypeQ3_K_L: case FileTypeQ3_K_L:
return "Q3_K_L" return "Q3_K_L"
case fileTypeQ4_K_S: case FileTypeQ4_K_S:
return "Q4_K_S" return "Q4_K_S"
case fileTypeQ4_K_M: case FileTypeQ4_K_M:
return "Q4_K_M" return "Q4_K_M"
case fileTypeQ5_K_S: case FileTypeQ5_K_S:
return "Q5_K_S" return "Q5_K_S"
case fileTypeQ5_K_M: case FileTypeQ5_K_M:
return "Q5_K_M" return "Q5_K_M"
case fileTypeQ6_K: case FileTypeQ6_K:
return "Q6_K" return "Q6_K"
case fileTypeIQ2_XXS: case FileTypeQ2_K_S:
return "IQ2_XXS"
case fileTypeIQ2_XS:
return "IQ2_XS"
case fileTypeQ2_K_S:
return "Q2_K_S" return "Q2_K_S"
case fileTypeIQ3_XS: case FileTypeBF16:
return "IQ3_XS"
case fileTypeIQ3_XXS:
return "IQ3_XXS"
case fileTypeIQ1_S:
return "IQ1_S"
case fileTypeIQ4_NL:
return "IQ4_NL"
case fileTypeIQ3_S:
return "IQ3_S"
case fileTypeIQ3_M:
return "IQ3_M"
case fileTypeIQ2_S:
return "IQ2_S"
case fileTypeIQ4_XS:
return "IQ4_XS"
case fileTypeIQ2_M:
return "IQ2_M"
case fileTypeIQ1_M:
return "IQ1_M"
case fileTypeBF16:
return "BF16" return "BF16"
default: default:
return "unknown" return "unknown"
} }
} }
func (t fileType) Value() uint32 { func (t FileType) Value() uint32 {
return uint32(t) return uint32(t)
} }
// ToTensorType returns the tensor type paired with the file type t.
//
// File types that differ only by a size suffix (for example Q3_K_S, Q3_K_M
// and Q3_K_L) share a single tensor type. A file type with no entry in the
// switch is reported through slog.Warn and mapped to TensorTypeF32.
func (t FileType) ToTensorType() TensorType {
	switch t {
	case FileTypeF32:
		return TensorTypeF32
	case FileTypeF16:
		return TensorTypeF16
	case FileTypeBF16:
		return TensorTypeBF16
	case FileTypeQ4_0:
		return TensorTypeQ4_0
	case FileTypeQ4_1:
		return TensorTypeQ4_1
	case FileTypeQ5_0:
		return TensorTypeQ5_0
	case FileTypeQ5_1:
		return TensorTypeQ5_1
	case FileTypeQ8_0:
		return TensorTypeQ8_0
	case FileTypeQ2_K, FileTypeQ2_K_S:
		return TensorTypeQ2_K
	case FileTypeQ3_K_S, FileTypeQ3_K_M, FileTypeQ3_K_L:
		return TensorTypeQ3_K
	case FileTypeQ4_K_S, FileTypeQ4_K_M:
		return TensorTypeQ4_K
	case FileTypeQ5_K_S, FileTypeQ5_K_M:
		return TensorTypeQ5_K
	case FileTypeQ6_K:
		return TensorTypeQ6_K
	}

	// No mapping: warn and fall back to the zero tensor type, F32.
	slog.Warn("unsupported file type", "type", t)
	return TensorTypeF32
}
// TensorType is equivalent to ggml_type for individual tensor types
// Note: these are not the same as FileType
//
// The numeric value of each constant below is assigned by iota in declaration
// order, so entries must not be reordered, removed, or inserted mid-list.
type TensorType uint32
const (
// TensorTypeF32 is the zero value of TensorType.
TensorTypeF32 TensorType = iota
TensorTypeF16
TensorTypeQ4_0
TensorTypeQ4_1
// Unexported entries are never referenced by name in this block; they occupy
// a slot so that the entries after them keep their iota value.
tensorTypeQ4_2 // unused by GGML
tensorTypeQ4_3 // unused by GGML
TensorTypeQ5_0
TensorTypeQ5_1
TensorTypeQ8_0
TensorTypeQ8_1
TensorTypeQ2_K
TensorTypeQ3_K
TensorTypeQ4_K
TensorTypeQ5_K
TensorTypeQ6_K
TensorTypeQ8_K
tensorTypeIQ2_XXS // not supported by ollama
tensorTypeIQ2_XS // not supported by ollama
tensorTypeIQ3_XXS // not supported by ollama
tensorTypeIQ1_S // not supported by ollama
tensorTypeIQ4_NL // not supported by ollama
tensorTypeIQ3_S // not supported by ollama
tensorTypeIQ2_S // not supported by ollama
tensorTypeIQ4_XS // not supported by ollama
TensorTypeI8
TensorTypeI16
TensorTypeI32
TensorTypeI64
TensorTypeF64
tensorTypeIQ1_M // not supported by ollama
TensorTypeBF16
tensorTypeQ4_0_4_4 // unused by GGML
tensorTypeQ4_0_4_8 // unused by GGML
tensorTypeQ4_0_8_8 // unused by GGML
tensorTypeTQ1_0 // not supported by ollama
tensorTypeTQ2_0 // not supported by ollama
tensorTypeIQ4_NL_4_4 // unused by GGML
tensorTypeIQ4_NL_4_8 // unused by GGML
tensorTypeIQ4_NL_8_8 // unused by GGML
)
// ParseTensorType converts the name of a tensor type, such as "Q4_K" or
// "BF16", into the matching TensorType.
//
// Only the names listed in the switch are accepted, and matching is
// case-sensitive. Any other input returns the zero TensorType together with
// an "unsupported quantization type" error; the returned value must be
// ignored when the error is non-nil.
func ParseTensorType(s string) (TensorType, error) {
	switch s {
	case "F32":
		return TensorTypeF32, nil
	case "F16":
		return TensorTypeF16, nil
	case "Q4_0":
		return TensorTypeQ4_0, nil
	case "Q4_1":
		return TensorTypeQ4_1, nil
	case "Q5_0":
		return TensorTypeQ5_0, nil
	case "Q5_1":
		return TensorTypeQ5_1, nil
	case "Q8_0":
		return TensorTypeQ8_0, nil
	case "Q8_1":
		return TensorTypeQ8_1, nil
	case "Q2_K":
		return TensorTypeQ2_K, nil
	case "Q3_K":
		return TensorTypeQ3_K, nil
	case "Q4_K":
		return TensorTypeQ4_K, nil
	case "Q5_K":
		return TensorTypeQ5_K, nil
	case "Q6_K":
		return TensorTypeQ6_K, nil
	case "Q8_K":
		return TensorTypeQ8_K, nil
	case "F64":
		return TensorTypeF64, nil
	case "BF16":
		return TensorTypeBF16, nil
	default:
		return 0, fmt.Errorf("unsupported quantization type %s", s)
	}
}
// IsQuantized reports whether t is anything other than F32, F16 or BF16.
//
// Every other value is reported as quantized, including the integer types
// and F64.
func (t TensorType) IsQuantized() bool {
	return t != TensorTypeF32 && t != TensorTypeF16 && t != TensorTypeBF16
}
// RowSize returns t.TypeSize() multiplied by ne and then divided by
// t.BlockSize(), using integer arithmetic.
//
// NOTE(review): presumably this is the byte size of a row of ne elements;
// TypeSize and BlockSize are defined elsewhere, so confirm their units there.
func (t TensorType) RowSize(ne uint64) uint64 {
	typeSize := t.TypeSize()
	blockSize := t.BlockSize()
	// Multiply before dividing so the truncation happens only once.
	return typeSize * ne / blockSize
}
// String returns the name of the tensor type t, for example "Q4_K".
// Types without an entry in the switch are rendered as "unknown".
func (t TensorType) String() string {
	name := "unknown"
	switch t {
	case TensorTypeF32:
		name = "F32"
	case TensorTypeF16:
		name = "F16"
	case TensorTypeQ4_0:
		name = "Q4_0"
	case TensorTypeQ4_1:
		name = "Q4_1"
	case TensorTypeQ5_0:
		name = "Q5_0"
	case TensorTypeQ5_1:
		name = "Q5_1"
	case TensorTypeQ8_0:
		name = "Q8_0"
	case TensorTypeQ8_1:
		name = "Q8_1"
	case TensorTypeQ2_K:
		name = "Q2_K"
	case TensorTypeQ3_K:
		name = "Q3_K"
	case TensorTypeQ4_K:
		name = "Q4_K"
	case TensorTypeQ5_K:
		name = "Q5_K"
	case TensorTypeQ6_K:
		name = "Q6_K"
	case TensorTypeQ8_K:
		name = "Q8_K"
	case TensorTypeF64:
		name = "F64"
	case TensorTypeBF16:
		name = "BF16"
	}
	return name
}
...@@ -11,7 +11,7 @@ require ( ...@@ -11,7 +11,7 @@ require (
github.com/spf13/cobra v1.7.0 github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.9.0 github.com/stretchr/testify v1.9.0
github.com/x448/float16 v0.8.4 github.com/x448/float16 v0.8.4
golang.org/x/sync v0.11.0 golang.org/x/sync v0.12.0
) )
require ( require (
...@@ -70,12 +70,12 @@ require ( ...@@ -70,12 +70,12 @@ require (
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect github.com/ugorji/go/codec v1.2.12 // indirect
golang.org/x/arch v0.8.0 // indirect golang.org/x/arch v0.8.0 // indirect
golang.org/x/crypto v0.33.0 golang.org/x/crypto v0.36.0
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
golang.org/x/net v0.35.0 // indirect golang.org/x/net v0.38.0 // indirect
golang.org/x/sys v0.30.0 golang.org/x/sys v0.31.0
golang.org/x/term v0.29.0 golang.org/x/term v0.30.0
golang.org/x/text v0.22.0 golang.org/x/text v0.23.0
google.golang.org/protobuf v1.34.1 google.golang.org/protobuf v1.34.1
gopkg.in/yaml.v3 v3.0.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect
) )
...@@ -214,8 +214,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk ...@@ -214,8 +214,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus= golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M= golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
...@@ -257,8 +257,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R ...@@ -257,8 +257,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8= golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
...@@ -268,8 +268,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ ...@@ -268,8 +268,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
...@@ -285,17 +285,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc ...@@ -285,17 +285,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik=
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
......
...@@ -34,13 +34,15 @@ func cosineSimilarity[V float32 | float64](v1, v2 []V) V { ...@@ -34,13 +34,15 @@ func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
func TestAllMiniLMEmbeddings(t *testing.T) { func TestAllMiniLMEmbeddings(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel() defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbeddingRequest{ req := api.EmbeddingRequest{
Model: "all-minilm", Model: "all-minilm",
Prompt: "why is the sky blue?", Prompt: "why is the sky blue?",
} }
res, err := embeddingTestHelper(ctx, t, req) res, err := embeddingTestHelper(ctx, client, t, req)
if err != nil { if err != nil {
t.Fatalf("error: %v", err) t.Fatalf("error: %v", err)
...@@ -62,13 +64,15 @@ func TestAllMiniLMEmbeddings(t *testing.T) { ...@@ -62,13 +64,15 @@ func TestAllMiniLMEmbeddings(t *testing.T) {
func TestAllMiniLMEmbed(t *testing.T) { func TestAllMiniLMEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel() defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbedRequest{ req := api.EmbedRequest{
Model: "all-minilm", Model: "all-minilm",
Input: "why is the sky blue?", Input: "why is the sky blue?",
} }
res, err := embedTestHelper(ctx, t, req) res, err := embedTestHelper(ctx, client, t, req)
if err != nil { if err != nil {
t.Fatalf("error: %v", err) t.Fatalf("error: %v", err)
...@@ -98,13 +102,15 @@ func TestAllMiniLMEmbed(t *testing.T) { ...@@ -98,13 +102,15 @@ func TestAllMiniLMEmbed(t *testing.T) {
func TestAllMiniLMBatchEmbed(t *testing.T) { func TestAllMiniLMBatchEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel() defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbedRequest{ req := api.EmbedRequest{
Model: "all-minilm", Model: "all-minilm",
Input: []string{"why is the sky blue?", "why is the grass green?"}, Input: []string{"why is the sky blue?", "why is the grass green?"},
} }
res, err := embedTestHelper(ctx, t, req) res, err := embedTestHelper(ctx, client, t, req)
if err != nil { if err != nil {
t.Fatalf("error: %v", err) t.Fatalf("error: %v", err)
...@@ -144,6 +150,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) { ...@@ -144,6 +150,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
func TestAllMiniLMEmbedTruncate(t *testing.T) { func TestAllMiniLMEmbedTruncate(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel() defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
truncTrue, truncFalse := true, false truncTrue, truncFalse := true, false
...@@ -182,7 +190,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) { ...@@ -182,7 +190,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
res := make(map[string]*api.EmbedResponse) res := make(map[string]*api.EmbedResponse)
for _, req := range reqs { for _, req := range reqs {
response, err := embedTestHelper(ctx, t, req.Request) response, err := embedTestHelper(ctx, client, t, req.Request)
if err != nil { if err != nil {
t.Fatalf("error: %v", err) t.Fatalf("error: %v", err)
} }
...@@ -198,7 +206,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) { ...@@ -198,7 +206,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
} }
// check that truncate set to false returns an error if context length is exceeded // check that truncate set to false returns an error if context length is exceeded
_, err := embedTestHelper(ctx, t, api.EmbedRequest{ _, err := embedTestHelper(ctx, client, t, api.EmbedRequest{
Model: "all-minilm", Model: "all-minilm",
Input: "why is the sky blue?", Input: "why is the sky blue?",
Truncate: &truncFalse, Truncate: &truncFalse,
...@@ -210,9 +218,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) { ...@@ -210,9 +218,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
} }
} }
func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) { func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil { if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err) t.Fatalf("failed to pull model %s: %v", req.Model, err)
} }
...@@ -226,9 +232,7 @@ func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingReq ...@@ -226,9 +232,7 @@ func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingReq
return response, nil return response, nil
} }
func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) { func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil { if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err) t.Fatalf("failed to pull model %s: %v", req.Model, err)
} }
......
...@@ -48,17 +48,6 @@ var ( ...@@ -48,17 +48,6 @@ var (
} }
) )
// getTimeouts derives a soft and a hard time budget from the deadline of t.
//
// Without a deadline it returns 8 and 10 minutes. With a deadline, the soft
// budget ends 2 minutes before it and the hard budget 20 seconds before it.
// If the deadline is 2 minutes away or less, the test is skipped.
func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
	deadline, ok := t.Deadline()
	if !ok {
		return 8 * time.Minute, 10 * time.Minute
	}

	if !deadline.After(time.Now().Add(2 * time.Minute)) {
		t.Skip("too little time")
		return time.Duration(0), time.Duration(0)
	}

	softEnd := deadline.Add(-2 * time.Minute)
	hardEnd := deadline.Add(-20 * time.Second)
	return time.Until(softEnd), time.Until(hardEnd)
}
func TestModelsGenerate(t *testing.T) { func TestModelsGenerate(t *testing.T) {
softTimeout, hardTimeout := getTimeouts(t) softTimeout, hardTimeout := getTimeouts(t)
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout) slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
......
//go:build integration && models
package integration
import (
"bytes"
"context"
"fmt"
"log/slog"
"strings"
"testing"
"time"
"github.com/ollama/ollama/api"
)
// TestQuantization quantizes each source model to every listed quantization
// level through the create API, checks the quantization level reported by
// show, and runs a short generation against the result as a smoke test.
//
// Each quantized model is deleted again when its subtest ends. Subtests that
// would start after the soft timeout has elapsed are skipped.
func TestQuantization(t *testing.T) {
	sourceModels := []string{
		"qwen2.5:0.5b-instruct-fp16",
	}
	quantizations := []string{
		"Q8_0",
		"Q4_K_S",
		"Q4_K_M",
		"Q4_K",
	}
	softTimeout, hardTimeout := getTimeouts(t)
	started := time.Now()
	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	for _, base := range sourceModels {
		if err := PullIfMissing(ctx, client, base); err != nil {
			t.Fatalf("pull failed %s", err)
		}
		for _, quant := range quantizations {
			newName := fmt.Sprintf("%s__%s", base, quant)
			t.Run(newName, func(t *testing.T) {
				if time.Since(started) > softTimeout {
					t.Skip("skipping remaining tests to avoid excessive runtime")
				}
				req := &api.CreateRequest{
					Model:        newName,
					Quantization: quant,
					From:         base,
				}
				// Progress updates are not needed here.
				fn := func(resp api.ProgressResponse) error {
					return nil
				}
				t.Logf("quantizing: %s -> %s", base, quant)
				if err := client.Create(ctx, req, fn); err != nil {
					t.Fatalf("create failed %s", err)
				}
				defer func() {
					req := &api.DeleteRequest{
						Model: newName,
					}
					t.Logf("deleting: %s -> %s", base, quant)
					if err := client.Delete(ctx, req); err != nil {
						t.Logf("failed to clean up %s: %s", req.Model, err)
					}
				}()

				// Check metadata on the model
				resp, err := client.Show(ctx, &api.ShowRequest{Name: newName})
				if err != nil {
					t.Fatalf("unable to show model: %s", err)
				}
				if !strings.Contains(resp.Details.QuantizationLevel, quant) {
					t.Fatalf("unexpected quantization for %s:\ngot: %s", newName, resp.Details.QuantizationLevel)
				}

				stream := true
				genReq := api.GenerateRequest{
					Model:     newName,
					Prompt:    "why is the sky blue?",
					KeepAlive: &api.Duration{Duration: 3 * time.Second},
					Options: map[string]any{
						"seed":        42,
						"temperature": 0.0,
					},
					Stream: &stream,
				}
				t.Logf("verifying: %s -> %s", base, quant)

				// Some smaller quantizations can cause models to have poor quality
				// or get stuck in repetition loops, so we stop as soon as we have any matches
				anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", "nitrogen", "oxygen"}
				reqCtx, reqCancel := context.WithCancel(ctx)
				// Always release the request context, including when no keyword matches.
				// Deferred after the delete above, so generation is cancelled first.
				defer reqCancel()

				atLeastOne := false
				var buf bytes.Buffer
				genfn := func(response api.GenerateResponse) error {
					buf.WriteString(response.Response)
					fullResp := strings.ToLower(buf.String())
					for _, want := range anyResp {
						if strings.Contains(fullResp, want) {
							atLeastOne = true
							t.Log(fullResp)
							reqCancel()
							break
						}
					}
					return nil
				}

				// Buffered so the goroutine can always deliver its result and exit,
				// even if the select below has already left through ctx.Done().
				done := make(chan error, 1)
				go func() {
					done <- client.Generate(reqCtx, &genReq, genfn)
				}()

				select {
				case genErr := <-done:
					// Cancelling after a match makes Generate return an error,
					// so an error only counts when nothing matched.
					if !atLeastOne {
						if genErr != nil {
							t.Fatalf("failed with %s request prompt %s: %v", genReq.Model, genReq.Prompt, genErr)
						}
						t.Fatalf("none of %v found in response from %s: %s", anyResp, genReq.Model, buf.String())
					}
				case <-ctx.Done():
					t.Fatal("outer test context done while waiting for generate")
				}
				t.Logf("passed")
			})
		}
	}
}
...@@ -217,6 +217,7 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin ...@@ -217,6 +217,7 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err) slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err)
return return
} }
defer fp.Close()
data, err := io.ReadAll(fp) data, err := io.ReadAll(fp)
if err != nil { if err != nil {
slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err) slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err)
...@@ -358,3 +359,14 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) { ...@@ -358,3 +359,14 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) {
} }
} }
} }
// getTimeouts derives the soft and hard time budgets for a test from the
// deadline configured on the test binary.
//
// When no deadline is set it returns fixed budgets of 8 minutes (soft) and
// 10 minutes (hard). Otherwise the soft budget ends 2 minutes before the
// deadline and the hard budget ends 20 seconds before it. If the deadline is
// not strictly more than 2 minutes away the test is skipped.
func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
	const (
		softMargin = 2 * time.Minute
		hardMargin = 20 * time.Second
	)

	deadline, ok := t.Deadline()
	if !ok {
		return 8 * time.Minute, 10 * time.Minute
	}

	// Not enough room left for even the soft budget: bail out of the test.
	if !deadline.After(time.Now().Add(softMargin)) {
		t.Skip("too little time")
		return 0, 0
	}

	soft = time.Until(deadline.Add(-softMargin))
	hard = time.Until(deadline.Add(-hardMargin))
	return soft, hard
}
...@@ -239,7 +239,7 @@ func (c *Causal) findStartLoc() (int, error) { ...@@ -239,7 +239,7 @@ func (c *Causal) findStartLoc() (int, error) {
} }
} }
return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, len(c.cells)) return 0, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
} }
func (c *Causal) updateSlidingWindow() { func (c *Causal) updateSlidingWindow() {
......
int LLAMA_BUILD_NUMBER = 0; int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "2016f07bd106c73699ecbaace80f55db5ed95dac"; char const *LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5";
char const *LLAMA_COMPILER = ""; char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = ""; char const *LLAMA_BUILD_TARGET = "";
...@@ -342,6 +342,8 @@ struct common_params { ...@@ -342,6 +342,8 @@ struct common_params {
// multimodal models (see examples/llava) // multimodal models (see examples/llava)
struct common_params_model mmproj; struct common_params_model mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s) std::vector<std::string> image; // path to image file(s)
// embedding // embedding
......
...@@ -16,6 +16,9 @@ using json = nlohmann::ordered_json; ...@@ -16,6 +16,9 @@ using json = nlohmann::ordered_json;
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") { static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max(); auto has_max = max_items != std::numeric_limits<int>::max();
if (max_items == 0) {
return "";
}
if (min_items == 0 && max_items == 1) { if (min_items == 0 && max_items == 1) {
return item_rule + "?"; return item_rule + "?";
} }
......
...@@ -2,8 +2,6 @@ ...@@ -2,8 +2,6 @@
#include "gguf.h" #include "gguf.h"
#include "clip.h" #include "clip.h"
#include "clip.h"
#include <climits> #include <climits>
#include <cstdarg> #include <cstdarg>
#include <string> #include <string>
...@@ -17,33 +15,31 @@ ...@@ -17,33 +15,31 @@
#define KEY_FTYPE "general.file_type" #define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name" #define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description" #define KEY_DESCRIPTION "general.description"
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version" #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
#define KEY_USE_GELU "clip.use_gelu" #define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu" #define KEY_USE_SILU "clip.use_silu"
#define KEY_N_EMBD "clip.%s.embedding_length" #define KEY_N_EMBD "clip.vision.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length" #define KEY_N_FF "clip.vision.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count" #define KEY_N_BLOCK "clip.vision.block_count"
#define KEY_N_HEAD "clip.%s.attention.head_count" #define KEY_N_HEAD "clip.vision.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" #define KEY_LAYER_NORM_EPS "clip.vision.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.%s.projection_dim" #define KEY_PROJ_DIM "clip.vision.projection_dim"
#define KEY_TOKENS "tokenizer.ggml.tokens"
#define KEY_N_POSITIONS "clip.text.context_length"
#define KEY_IMAGE_SIZE "clip.vision.image_size" #define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std" #define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_FEATURE_LAYER "clip.vision.feature_layer" #define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl
#define KEY_USE_RMS_NORM "clip.use_rms_norm" // for qwen2.5vl
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
// //
...@@ -60,7 +56,9 @@ ...@@ -60,7 +56,9 @@
#define TN_ATTN_V "%s.blk.%d.attn_v.%s" #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" #define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
#define TN_LN_1 "%s.blk.%d.ln1.%s" #define TN_LN_1 "%s.blk.%d.ln1.%s"
#define TN_LN_2 "%s.blk.%d.ln2.%s" #define TN_LN_2 "%s.blk.%d.ln2.%s"
#define TN_LN_PRE "%s.pre_ln.%s" #define TN_LN_PRE "%s.pre_ln.%s"
...@@ -72,6 +70,8 @@ ...@@ -72,6 +70,8 @@
#define TN_IMAGE_NEWLINE "model.image_newline" #define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
// mimicpmv // mimicpmv
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
...@@ -87,18 +87,19 @@ ...@@ -87,18 +87,19 @@
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s" #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s" #define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s" #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
#define TN_GLM_BOI_W "adapter.boi"
#define TN_GLM_EOI_W "adapter.eoi"
enum projector_type { enum projector_type {
PROJECTOR_TYPE_MLP, PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM, PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP, PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_LDPV2, PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_RESAMPLER, PROJECTOR_TYPE_MINICPMV,
PROJECTOR_TYPE_GLM_EDGE, PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_MERGER, PROJECTOR_TYPE_QWEN2VL,
PROJECTOR_TYPE_GEMMA3, PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL,
PROJECTOR_TYPE_QWEN25VL,
PROJECTOR_TYPE_UNKNOWN, PROJECTOR_TYPE_UNKNOWN,
}; };
...@@ -106,10 +107,13 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = { ...@@ -106,10 +107,13 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" }, { PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" }, { PROJECTOR_TYPE_LDP, "ldp" },
{ PROJECTOR_TYPE_LDPV2, "ldpv2"}, { PROJECTOR_TYPE_LDPV2, "ldpv2"},
{ PROJECTOR_TYPE_RESAMPLER, "resampler"}, { PROJECTOR_TYPE_MINICPMV, "resampler"},
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"}, { PROJECTOR_TYPE_GLM_EDGE, "adapter"},
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
}; };
static projector_type clip_projector_type_from_string(const std::string & str) { static projector_type clip_projector_type_from_string(const std::string & str) {
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include <cinttypes> #include <cinttypes>
#include <limits> #include <limits>
#include <array> #include <array>
#include <numeric>
#if defined(_WIN32) #if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN
...@@ -172,14 +173,18 @@ struct clip_hparams { ...@@ -172,14 +173,18 @@ struct clip_hparams {
int32_t projection_dim; int32_t projection_dim;
int32_t n_head; int32_t n_head;
int32_t n_layer; int32_t n_layer;
int32_t proj_scale_factor = 0; // idefics3
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
float eps; float eps = 1e-6;
float rope_theta = 0.0;
std::vector<int32_t> image_grid_pinpoints; std::vector<int32_t> image_grid_pinpoints;
int32_t image_crop_resolution; int32_t image_crop_resolution;
std::unordered_set<int32_t> vision_feature_layer; std::unordered_set<int32_t> vision_feature_layer;
int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0;
}; };
struct clip_layer { struct clip_layer {
...@@ -199,11 +204,20 @@ struct clip_layer { ...@@ -199,11 +204,20 @@ struct clip_layer {
struct ggml_tensor * ln_1_b = nullptr; struct ggml_tensor * ln_1_b = nullptr;
// ff // ff
struct ggml_tensor * ff_i_w = nullptr; struct ggml_tensor * ff_i_w = nullptr; // legacy naming
struct ggml_tensor * ff_i_b = nullptr; struct ggml_tensor * ff_i_b = nullptr; // legacy naming
struct ggml_tensor * ff_o_w = nullptr; // legacy naming
struct ggml_tensor * ff_o_b = nullptr; // legacy naming
struct ggml_tensor * ff_o_w = nullptr; struct ggml_tensor * ff_up_w = nullptr;
struct ggml_tensor * ff_o_b = nullptr; struct ggml_tensor * ff_up_b = nullptr;
struct ggml_tensor * ff_gate_w = nullptr;
struct ggml_tensor * ff_gate_b = nullptr;
struct ggml_tensor * ff_down_w = nullptr;
struct ggml_tensor * ff_down_b = nullptr;
struct ggml_tensor * ff_g_w = NULL;
struct ggml_tensor * ff_g_b = NULL;
// layernorm 2 // layernorm 2
struct ggml_tensor * ln_2_w = nullptr; struct ggml_tensor * ln_2_w = nullptr;
...@@ -249,8 +263,6 @@ struct clip_vision_model { ...@@ -249,8 +263,6 @@ struct clip_vision_model {
//GLMV-Edge projection //GLMV-Edge projection
struct ggml_tensor * mm_model_adapter_conv_w = nullptr; struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
struct ggml_tensor * mm_model_adapter_conv_b = nullptr; struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
struct ggml_tensor * boi_w = nullptr;
struct ggml_tensor * eoi_w = nullptr;
// MobileVLM projection // MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w = nullptr; struct ggml_tensor * mm_model_mlp_1_w = nullptr;
...@@ -309,16 +321,14 @@ struct clip_vision_model { ...@@ -309,16 +321,14 @@ struct clip_vision_model {
// gemma3 // gemma3
struct ggml_tensor * mm_input_proj_w = nullptr; struct ggml_tensor * mm_input_proj_w = nullptr;
struct ggml_tensor * mm_soft_emb_norm_w = nullptr; struct ggml_tensor * mm_soft_emb_norm_w = nullptr;
// pixtral
struct ggml_tensor * token_embd_img_break = nullptr;
}; };
struct clip_ctx { struct clip_ctx {
bool has_text_encoder = false;
bool has_vision_encoder = false;
bool has_llava_projector = false; bool has_llava_projector = false;
bool has_minicpmv_projector = false; int minicpmv_version = 0;
bool has_glm_projector = false;
bool has_qwen2vl_merger = false;
int minicpmv_version = 2;
struct clip_vision_model vision_model; struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP; projector_type proj_type = PROJECTOR_TYPE_MLP;
...@@ -341,6 +351,7 @@ struct clip_ctx { ...@@ -341,6 +351,7 @@ struct clip_ctx {
ggml_backend_t backend_cpu; ggml_backend_t backend_cpu;
ggml_backend_buffer_ptr buf; ggml_backend_buffer_ptr buf;
int max_nodes = 8192;
ggml_backend_sched_ptr sched; ggml_backend_sched_ptr sched;
clip_image_size load_image_size; clip_image_size load_image_size;
...@@ -376,23 +387,20 @@ struct clip_ctx { ...@@ -376,23 +387,20 @@ struct clip_ctx {
} }
}; };
static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) { static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) {
const auto & model = ctx->vision_model; const auto & model = ctx->vision_model;
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
const int image_size = hparams.image_size; int image_size_width = img.nx;
int image_size_width = image_size; int image_size_height = img.ny;
int image_size_height = image_size;
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int hidden_size = hparams.hidden_size; const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head; const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head; const int d_head = hidden_size / n_head;
const int n_layer = hparams.n_layer; const int n_layer = hparams.n_layer;
const float eps = hparams.eps; const float eps = hparams.eps;
GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1
struct ggml_init_params params = { struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(), /*.mem_size =*/ ctx->buf_compute_meta.size(),
...@@ -519,6 +527,35 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im ...@@ -519,6 +527,35 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
embeddings = ggml_mul_mat(ctx0, embeddings = ggml_mul_mat(ctx0,
ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
embeddings); embeddings);
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
ggml_tensor * cur = embeddings;
const int scale_factor = model.hparams.proj_scale_factor;
const int n_embd = cur->ne[0];
const int seq = cur->ne[1];
const int bsz = 1; // batch size, always 1 for now since we don't support batching
const int height = std::sqrt(seq);
const int width = std::sqrt(seq);
GGML_ASSERT(scale_factor != 0);
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
n_embd * scale_factor * scale_factor,
height / scale_factor,
width / scale_factor,
bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
n_embd * scale_factor * scale_factor,
seq / (scale_factor * scale_factor),
bsz);
cur = ggml_mul_mat(ctx0, model.projection, cur);
embeddings = cur;
} else {
GGML_ABORT("SigLIP: Unsupported projector type");
} }
// build the graph // build the graph
...@@ -527,19 +564,462 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im ...@@ -527,19 +564,462 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
return gf; return gf;
} }
static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { // implementation of the 2D RoPE without adding a new op in ggml
if (!ctx->has_vision_encoder) { // this is not efficient (use double the memory), but works on all backends
LOG_ERR("This gguf file seems to have no vision encoder\n"); // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
return nullptr; static ggml_tensor * build_rope_2d(
ggml_context * ctx0,
ggml_tensor * cur,
ggml_tensor * pos_h,
ggml_tensor * pos_w,
const float freq_base
) {
const int64_t n_dim = cur->ne[0];
const int64_t n_head = cur->ne[1];
const int64_t n_pos = cur->ne[2];
// for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
// we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
// first half of cur will use 1e-0, 1e-2 (even)
// second half of cur will use 1e-1, 1e-3 (odd)
// the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
// ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
// then for the second half, we use freq_scale to shift the inv_freq
// ^ why? replace (2i) with (2i+1) in the above equation
const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
// first half
ggml_tensor * first;
{
first = ggml_view_3d(ctx0, cur,
n_dim/2, n_head, n_pos,
ggml_row_size(cur->type, n_dim),
ggml_row_size(cur->type, n_dim*n_head),
0);
first = ggml_rope_ext(
ctx0,
first,
pos_h, // positions
nullptr, // freq factors
n_dim/2, // n_dims
0, 0, freq_base,
1.0f, 0.0f, 1.0f, 0.0f, 0.0f
);
}
// second half
ggml_tensor * second;
{
second = ggml_view_3d(ctx0, cur,
n_dim/2, n_head, n_pos,
ggml_row_size(cur->type, n_dim),
ggml_row_size(cur->type, n_dim*n_head),
n_dim/2 * ggml_element_size(cur));
second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
second = ggml_rope_ext(
ctx0,
second,
pos_w, // positions
nullptr, // freq factors
n_dim/2, // n_dims
0, 0, freq_base,
freq_scale_odd,
0.0f, 1.0f, 0.0f, 0.0f
);
}
cur = ggml_concat(ctx0, first, second, 0);
return cur;
}
// Builds the ggml compute graph for the Pixtral vision encoder for one image.
//
// Pipeline, as constructed below:
//   1. patch embedding via a strided conv over the raw image ("inp_raw")
//   2. RMS pre-norm scaled by model.pre_ln_w
//   3. n_layer transformer blocks: RMS norm -> self-attention with 2D RoPE on Q/K
//      (positions come from the "pos_h" / "pos_w" inputs) -> residual ->
//      RMS norm -> SiLU-gated feed-forward -> residual
//   4. two-layer projector with a GELU in between (mm_1_*, mm_2_*)
//   5. one [IMG_BREAK] embedding appended after every patch row except the last
//
// Params:
//   ctx - clip context; must have proj_type == PROJECTOR_TYPE_PIXTRAL (asserted)
//   img - image whose nx/ny determine the graph's input size
// Returns the graph whose final node is the projected embedding sequence.
// The tensors named "inp_raw", "pos_h" and "pos_w" are marked as graph inputs;
// presumably the caller fills them before computing the graph - confirm at call site.
static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) {
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
GGML_ASSERT(ctx->proj_type == PROJECTOR_TYPE_PIXTRAL);
int image_size_width = img.nx;
int image_size_height = img.ny;
const int patch_size = hparams.patch_size;
// integer division: any remainder narrower than a full patch is not counted
const int n_patches_x = image_size_width / patch_size;
const int n_patches_y = image_size_height / patch_size;
const int num_patches = n_patches_x * n_patches_y;
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
// the context only holds tensor/graph metadata (no_alloc = true), backed by ctx->buf_compute_meta
struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
// NOTE(review): ctx0_ptr presumably releases the ggml context on scope exit while the
// returned graph keeps living in ctx->buf_compute_meta - confirm.
ggml_context_ptr ctx0_ptr(ggml_init(params));
auto ctx0 = ctx0_ptr.get();
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
// input raw
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
// 2D input positions
struct ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
struct ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
// patch embedding: conv with stride == kernel == patch_size, then flatten the patch grid
// and transpose so that each patch becomes one row of hidden_size values
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
struct ggml_tensor * embeddings = inp;
// pre-layer norm
embeddings = ggml_mul(ctx0, ggml_rms_norm(ctx0, embeddings, eps), model.pre_ln_w);
// loop over layers
for (int il = 0; il < n_layer; il++) {
struct ggml_tensor * cur = embeddings;
// pre-attention norm
cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_1_w);
// self-attention
{
// Q and K get the 2D rotary embedding; V does not
struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur);
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
Q = build_rope_2d(ctx0, Q, pos_h, pos_w, hparams.rope_theta);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur);
K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur);
V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
// attention scores, scaled by 1/sqrt(d_head) inside the softmax; no mask is applied
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
// merge the heads back into a [hidden_size, num_patches] tensor
KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
// attention output projection (no bias is added here)
cur = ggml_mul_mat(ctx0, model.layers[il].o_w, cur);
}
// re-add the layer input, i.e. the residual connection
cur = ggml_add(ctx0, cur, embeddings);
embeddings = cur; // embeddings = residual, cur = hidden_states
// pre-ffn norm
cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_2_w);
// feed-forward
{
// gated FFN: down( up(x) * silu(gate(x)) )
ggml_tensor * gate_proj = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur);
ggml_tensor * up_proj = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur);
gate_proj = ggml_silu(ctx0, gate_proj); // pixtral uses silu
cur = ggml_mul(ctx0, up_proj, gate_proj);
cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur);
}
// residual 2
cur = ggml_add(ctx0, embeddings, cur);
embeddings = cur;
}
// LlavaMultiModalProjector (with GELU activation)
{
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
}
// arrangement of the [IMG_BREAK] token
{
// not efficient, but works
// the trick is to view the embeddings as a 3D tensor with shape [hidden_size, n_patches_per_row, n_rows]
// and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
// after the concatenation, we have a tensor with shape [hidden_size, n_patches_per_row + 1, n_rows]
const int n_embd_text = embeddings->ne[0];
const int n_tokens_output = num_patches + n_patches_y - 1; // one [IMG_BREAK] per row, except the last row
ggml_tensor * cur = ggml_reshape_3d(ctx0, embeddings, n_embd_text, n_patches_x, n_patches_y);
// NOTE(review): tok is a fresh tensor that is never written before being multiplied by 0;
// if its backing memory happens to contain NaN/Inf the product is NaN, not 0 - confirm this is safe.
ggml_tensor * tok = ggml_new_tensor_3d(ctx0, embeddings->type, n_embd_text, 1, n_patches_y);
tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor
tok = ggml_add(ctx0, tok, model.token_embd_img_break);
cur = ggml_concat(ctx0, cur, tok, 1);
// flatten back to 2D and keep only the first n_tokens_output rows, which drops the
// [IMG_BREAK] that was appended after the last patch row
embeddings = ggml_view_2d(ctx0, cur,
n_embd_text, n_tokens_output,
ggml_row_size(cur->type, n_embd_text), 0);
}
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}
// Builds the ggml compute graph for the Qwen2.5-VL vision encoder.
//
// Pipeline, as constructed below:
//   1. patch embedding: sum of two strided convs over "inp_raw", then a reshape/permute
//      sequence that regroups patches in 2x2 neighbourhoods
//   2. optional RMS pre-norm (only if model.pre_ln_w is present)
//   3. optional window reordering of 4-patch groups via the "inv_window_idx" input
//   4. n_layer transformer blocks: RMS norm -> attention with multi-section RoPE
//      ("positions" input) -> residual -> RMS norm -> gated MLP -> residual.
//      With window attention enabled, every n_wa_pattern-th layer uses full attention
//      and all other layers apply the "window_mask" input.
//   5. optional RMS post-norm, then a projector that merges 4 consecutive tokens
//      (mm_0 -> GELU -> mm_1)
//   6. with window attention, rows are reordered again via the "window_idx" input
//
// Params:
//   ctx  - clip context providing the model weights, hparams and activation flags
//   imgs - image batch; exactly one entry is supported (asserted)
// Returns the graph whose final node is the projected embedding tensor.
static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
// NOTE(review): entries[0] is dereferenced here, before the batch_size == 1 assert below
const int image_size_width = imgs.entries[0]->nx;
const int image_size_height = imgs.entries[0]->ny;
// window attention is enabled only when the model declares a positive pattern length
const bool use_window_attn = hparams.n_wa_pattern > 0;
const int n_wa_pattern = hparams.n_wa_pattern;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int patches_w = image_size_width / patch_size;
const int patches_h = image_size_height / patch_size;
const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
// four equal rope sections of d_head/4 each
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
const int batch_size = imgs.entries.size();
GGML_ASSERT(batch_size == 1);
// the context only holds tensor/graph metadata (no_alloc = true), backed by ctx->buf_compute_meta
struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
ggml_context_ptr ctx0_ptr(ggml_init(params));
auto ctx0 = ctx0_ptr.get();
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
// the reshapes below halve patches_w and patches_h, so both image sides must be
// multiples of two patches
GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
// second conv over the same input, summed with the first
// (presumably the two kernels correspond to two temporal slices of the original 3D patch embedding - TODO confirm)
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
// regroup patches so that each 2x2 neighbourhood ends up contiguous in the sequence
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, patches_h, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
inp = ggml_reshape_3d(
ctx0, inp,
hidden_size, patches_w * patches_h, batch_size);
if (model.patch_bias) {
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
inp = ggml_add(ctx0, inp, model.patch_bias);
}
struct ggml_tensor * embeddings = inp;
struct ggml_tensor * window_mask = nullptr;
struct ggml_tensor * window_idx = nullptr;
struct ggml_tensor * inv_window_idx = nullptr;
// rope position ids: 4 values per position, supplied as a graph input
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// pre-layernorm
if (model.pre_ln_w) {
embeddings = ggml_rms_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "pre_ln");
embeddings = ggml_mul(ctx0, embeddings, model.pre_ln_w);
}
if (use_window_attn) {
// handle window attention inputs
// one index per group of 4 patches
inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
ggml_set_name(inv_window_idx, "inv_window_idx");
ggml_set_input(inv_window_idx);
// mask for window attention
window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, num_positions, num_positions);
ggml_set_name(window_mask, "window_mask");
ggml_set_input(window_mask);
// embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
GGML_ASSERT(batch_size == 1);
// view each group of 4 patches as one row, reorder the rows, then restore the per-patch layout
embeddings = ggml_reshape_2d(ctx0, embeddings, hidden_size * 4, patches_w * patches_h * batch_size / 4);
embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx);
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size);
}
// loop over layers
for (int il = 0; il < n_layer; il++) {
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
// rmsnorm1
cur = ggml_rms_norm(ctx0, cur, eps);
cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w);
// self-attention
{
// Q and K: linear + bias, split into heads, rotary embedding, then heads folded into the batch dim
struct ggml_tensor * Q =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
// NOTE(review): the trailing literals are hard-coded rope parameters (presumably
// n_ctx_orig = 32768 and freq_base = 10000) - confirm against the ggml_rope_multi signature
Q = ggml_rope_multi(
ctx0, Q, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
struct ggml_tensor * K =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
K = ggml_rope_multi(
ctx0, K, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
// V: linear + bias, no rotary embedding
struct ggml_tensor * V =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
// with window attention, only every n_wa_pattern-th layer attends globally;
// the remaining layers are restricted by window_mask
const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
if (full_attn) {
KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
} else {
KQ = ggml_soft_max_ext(ctx0, KQ, window_mask, 1.0f / sqrtf((float)d_head), 0.0f);
}
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
// merge the heads back into [hidden_size, num_positions, batch_size]
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
}
// attention output
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
// re-add the layer input, i.e. the residual connection
cur = ggml_add(ctx0, cur, embeddings);
embeddings = cur; // embeddings = residual, cur = hidden_states
// rms norm2
cur = ggml_rms_norm(ctx0, cur, eps);
cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w);
// mlp
// ffn_up
// NOTE(review): the up projection reads ff_o_* and the down projection reads ff_i_*,
// which looks inverted relative to the field names - confirm against the tensor loader
auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_o_b);
auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_g_w, cur);
cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_g_b);
// TODO : only 2 of these 3 are actually used, should we remove one of them?
// gate activation: GELU if use_gelu, else SiLU if use_silu, else quick-GELU
if (ctx->use_gelu) {
cur_gate = ggml_gelu_inplace(ctx0, cur_gate);
} else if (ctx->use_silu) {
cur_gate = ggml_silu_inplace(ctx0, cur_gate);
} else {
cur_gate = ggml_gelu_quick_inplace(ctx0, cur_gate);
}
cur = ggml_mul(ctx0, cur_gate, cur_up);
// ffn_down
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
// residual 2
cur = ggml_add(ctx0, embeddings, cur);
embeddings = cur;
}
// post-layernorm
if (model.post_ln_w) {
embeddings = ggml_rms_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "post_ln");
embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w);
// NOTE(review): the doubled closing brace on the next line looks like side-by-side diff
// residue (one brace too many for this function) - confirm against the upstream file
} }
// projector: merge every 4 consecutive tokens into one row of hidden_size * 4 values
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// GELU activation
embeddings = ggml_gelu(ctx0, embeddings);
// Second linear layer
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
if (use_window_attn) {
// reorder the merged rows with a second index input
// (presumably the inverse of the inv_window_idx reordering above - TODO confirm)
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
ggml_set_name(window_idx, "window_idx");
ggml_set_input(window_idx);
// embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
GGML_ASSERT(batch_size == 1);
embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4);
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4, batch_size);
}
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}
static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
const auto & model = ctx->vision_model; const auto & model = ctx->vision_model;
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
const int image_size = hparams.image_size; const int image_size = hparams.image_size;
int image_size_width = image_size; int image_size_width = image_size;
int image_size_height = image_size; int image_size_height = image_size;
if (ctx->has_minicpmv_projector) {
if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height); LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height);
image_size_width = load_image_size.width; image_size_width = load_image_size.width;
image_size_height = load_image_size.height; image_size_height = load_image_size.height;
...@@ -548,7 +1028,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ...@@ -548,7 +1028,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
image_size_height = imgs.entries[0]->ny; image_size_height = imgs.entries[0]->ny;
} }
} }
else if (ctx->has_qwen2vl_merger) {
else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
// use the image's native resolution when image is avaible // use the image's native resolution when image is avaible
if (is_inf) { if (is_inf) {
// if (imgs->data->nx && imgs->data->ny) { // if (imgs->data->nx && imgs->data->ny) {
...@@ -556,12 +1037,13 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ...@@ -556,12 +1037,13 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
image_size_height = imgs.entries[0]->ny; image_size_height = imgs.entries[0]->ny;
} }
} }
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int patches_w = image_size_width / patch_size; const int patches_w = image_size_width / patch_size;
const int patches_h = image_size_height / patch_size; const int patches_h = image_size_height / patch_size;
const int num_positions = num_patches + (model.class_embedding ? 1 : 0); const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions; const int num_position_ids = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ? num_positions * 4 : num_positions;
const int hidden_size = hparams.hidden_size; const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head; const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head; const int d_head = hidden_size / n_head;
...@@ -570,7 +1052,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ...@@ -570,7 +1052,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
const int batch_size = imgs.entries.size(); const int batch_size = imgs.entries.size();
if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) { if (ctx->has_llava_projector
|| ctx->proj_type == PROJECTOR_TYPE_MINICPMV
|| ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
GGML_ASSERT(batch_size == 1); GGML_ASSERT(batch_size == 1);
} }
...@@ -591,8 +1075,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ...@@ -591,8 +1075,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
if (ctx->has_qwen2vl_merger) { if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
GGML_ASSERT(image_size_width % (patch_size * 2) == 0); GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
GGML_ASSERT(image_size_height % (patch_size * 2) == 0); GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
...@@ -621,40 +1105,30 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ...@@ -621,40 +1105,30 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
struct ggml_tensor * embeddings = inp; struct ggml_tensor * embeddings = inp;
struct ggml_tensor * pos_embed = nullptr; struct ggml_tensor * pos_embed = nullptr;
if (ctx->has_llava_projector) { // concat class_embeddings and patch_embeddings
// concat class_embeddings and patch_embeddings if (model.class_embedding) {
if (model.class_embedding) { embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); embeddings = ggml_scale(ctx0, embeddings, 0.0f); // set to all zeros
ggml_set_name(embeddings, "embeddings"); embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
ggml_set_input(embeddings); embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
} }
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions"); ggml_set_name(positions, "positions");
ggml_set_input(positions); ggml_set_input(positions);
if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding if (ctx->proj_type != PROJECTOR_TYPE_QWEN2VL) { // qwen2vl does NOT use learned position embeddings
embeddings = embeddings =
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
} }
if (ctx->has_minicpmv_projector) { if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
int pos_w = image_size_width/patch_size; int pos_w = image_size_width/patch_size;
int pos_h = image_size_height/patch_size; int pos_h = image_size_height/patch_size;
if (ctx->minicpmv_version == 2) { int n_output_dim = clip_n_mmproj_embd(ctx);
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1);
}
else if (ctx->minicpmv_version == 3) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
else if (ctx->minicpmv_version == 4) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
ggml_set_name(pos_embed, "pos_embed"); ggml_set_name(pos_embed, "pos_embed");
ggml_set_input(pos_embed); ggml_set_input(pos_embed);
} }
...@@ -697,7 +1171,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ...@@ -697,7 +1171,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
if (ctx->has_qwen2vl_merger) { if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
Q = ggml_rope_multi( Q = ggml_rope_multi(
ctx0, Q, positions, nullptr, ctx0, Q, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
...@@ -709,7 +1183,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ...@@ -709,7 +1183,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
if (ctx->has_qwen2vl_merger) { if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
K = ggml_rope_multi( K = ggml_rope_multi(
ctx0, K, positions, nullptr, ctx0, K, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
...@@ -974,106 +1448,92 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ...@@ -974,106 +1448,92 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
} }
} }
// minicpmv projector // minicpmv projector
else if (ctx->has_minicpmv_projector) else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
{ struct ggml_tensor * q = model.mm_model_query;
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { { // layernorm
struct ggml_tensor * q = model.mm_model_query; q = ggml_norm(ctx0, q, eps);
{ // layernorm q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
q = ggml_norm(ctx0, q, eps); }
q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
{ // layernorm
v = ggml_norm(ctx0, v, eps);
v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
}
struct ggml_tensor * k;
{ // position
// q = ggml_add(ctx0, q, model.mm_model_pos_embed);
k = ggml_add(ctx0, v, pos_embed);
}
{ // attention
int hidden_size = clip_n_mmproj_embd(ctx);
const int d_head = 128;
int n_head = hidden_size/d_head;
int num_query = 96;
if (ctx->minicpmv_version == 2) {
num_query = 96;
} }
struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); else if (ctx->minicpmv_version == 3) {
{ // layernorm num_query = 64;
v = ggml_norm(ctx0, v, eps);
v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
} }
struct ggml_tensor * k; else if (ctx->minicpmv_version == 4) {
{ // position num_query = 64;
// q = ggml_add(ctx0, q, model.mm_model_pos_embed);
k = ggml_add(ctx0, v, pos_embed);
} }
{ // attention struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
int hidden_size = 4096; struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
const int d_head = 128; struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
int n_head = hidden_size/d_head; // permute
int num_query = 96; Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
if (ctx->minicpmv_version == 2) { Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
hidden_size = 4096; Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
n_head = hidden_size/d_head; K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
num_query = 96; K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
} K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
else if (ctx->minicpmv_version == 3) { V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
hidden_size = 3584; V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
n_head = hidden_size/d_head; V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
num_query = 64; struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
} KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
else if (ctx->minicpmv_version == 4) { struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
hidden_size = 3584; KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
n_head = hidden_size/d_head; KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
num_query = 64; KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
}
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
// permute
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
}
{ // layernorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
}
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
} }
else { { // layernorm
GGML_ASSERT(false); embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
} }
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
} }
// glm projector // glm projector
else if (ctx->has_glm_projector) { else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
size_t gridsz = (size_t)sqrt(embeddings->ne[1]); embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); // GLU
//GLU {
{ embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_norm(ctx0, embeddings, eps); embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); embeddings = ggml_gelu_inplace(ctx0, embeddings);
embeddings = ggml_gelu_inplace(ctx0, embeddings); struct ggml_tensor * x = embeddings;
struct ggml_tensor * x = embeddings; embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); embeddings = ggml_silu_inplace(ctx0, embeddings);
embeddings = ggml_silu_inplace(ctx0, embeddings); embeddings = ggml_mul(ctx0, embeddings,x);
embeddings = ggml_mul(ctx0, embeddings,x); embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
}
} else {
GGML_ABORT("fatal error");
} }
} }
else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
...@@ -1094,12 +1554,30 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ...@@ -1094,12 +1554,30 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
} }
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { ggml_cgraph * res;
return clip_image_build_graph_siglip(ctx, imgs); switch (ctx->proj_type) {
} else { case PROJECTOR_TYPE_GEMMA3:
// TODO: we should have one build_* function per model case PROJECTOR_TYPE_IDEFICS3:
return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); {
GGML_ASSERT(imgs.entries.size() == 1);
res = clip_image_build_graph_siglip(ctx, *imgs.entries[0]);
} break;
case PROJECTOR_TYPE_PIXTRAL:
{
GGML_ASSERT(imgs.entries.size() == 1);
res = clip_image_build_graph_pixtral(ctx, *imgs.entries[0]);
} break;
case PROJECTOR_TYPE_QWEN25VL:
{
res = clip_image_build_graph_qwen25vl(ctx, imgs);
} break;
default:
{
// TODO: we should have one build_* function per model
res = clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
} break;
} }
return res;
} }
struct clip_model_loader { struct clip_model_loader {
...@@ -1109,7 +1587,7 @@ struct clip_model_loader { ...@@ -1109,7 +1587,7 @@ struct clip_model_loader {
clip_ctx & ctx_clip; clip_ctx & ctx_clip;
std::string fname; std::string fname;
size_t model_size; // in bytes size_t model_size = 0; // in bytes
// TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model // TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model
clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) { clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) {
...@@ -1160,9 +1638,11 @@ struct clip_model_loader { ...@@ -1160,9 +1638,11 @@ struct clip_model_loader {
} }
void load_hparams() { void load_hparams() {
auto & hparams = ctx_clip.vision_model.hparams;
// projector type // projector type
std::string proj_type;
{ {
std::string proj_type;
get_string(KEY_PROJ_TYPE, proj_type, false); get_string(KEY_PROJ_TYPE, proj_type, false);
if (!proj_type.empty()) { if (!proj_type.empty()) {
ctx_clip.proj_type = clip_projector_type_from_string(proj_type); ctx_clip.proj_type = clip_projector_type_from_string(proj_type);
...@@ -1174,34 +1654,27 @@ struct clip_model_loader { ...@@ -1174,34 +1654,27 @@ struct clip_model_loader {
// other hparams // other hparams
{ {
get_bool(KEY_HAS_TEXT_ENC, ctx_clip.has_text_encoder, false);
get_bool(KEY_HAS_VIS_ENC, ctx_clip.has_vision_encoder, false);
GGML_ASSERT(ctx_clip.has_vision_encoder);
GGML_ASSERT(!ctx_clip.has_text_encoder);
// legacy keys, use KEY_PROJ_TYPE instead
get_bool(KEY_HAS_LLAVA_PROJ, ctx_clip.has_llava_projector, false);
get_bool(KEY_HAS_MINICPMV_PROJ, ctx_clip.has_minicpmv_projector, false);
get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false);
get_bool(KEY_HAS_GLM_PROJ, ctx_clip.has_glm_projector, false);
get_bool(KEY_HAS_QWEN2VL_MERGER, ctx_clip.has_qwen2vl_merger, false);
// !!! do NOT extend the list above, use KEY_PROJ_TYPE instead
get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false); get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false);
get_bool(KEY_USE_SILU, ctx_clip.use_silu, false); get_bool(KEY_USE_SILU, ctx_clip.use_silu, false);
auto & hparams = ctx_clip.vision_model.hparams; get_u32(KEY_N_EMBD, hparams.hidden_size);
get_u32(string_format(KEY_N_EMBD, "vision"), hparams.hidden_size); get_u32(KEY_N_HEAD, hparams.n_head);
get_u32(string_format(KEY_N_HEAD, "vision"), hparams.n_head); get_u32(KEY_N_FF, hparams.n_intermediate);
get_u32(string_format(KEY_N_FF, "vision"), hparams.n_intermediate); get_u32(KEY_N_BLOCK, hparams.n_layer);
get_u32(string_format(KEY_N_BLOCK, "vision"), hparams.n_layer); get_u32(KEY_PROJ_DIM, hparams.projection_dim);
get_u32(string_format(KEY_PROJ_DIM, "vision"), hparams.projection_dim); get_f32(KEY_LAYER_NORM_EPS, hparams.eps);
get_f32(string_format(KEY_LAYER_NORM_EPS, "vision"), hparams.eps); get_u32(KEY_IMAGE_SIZE, hparams.image_size);
get_u32(KEY_IMAGE_SIZE, hparams.image_size); get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_PATCH_SIZE, hparams.patch_size); get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false); get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP
|| ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM
|| ctx_clip.proj_type == PROJECTOR_TYPE_LDP
|| ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2;
{ {
std::string mm_patch_merge_type; std::string mm_patch_merge_type;
get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false); get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false);
...@@ -1234,15 +1707,62 @@ struct clip_model_loader { ...@@ -1234,15 +1707,62 @@ struct clip_model_loader {
for (auto & layer : vision_feature_layer) { for (auto & layer : vision_feature_layer) {
hparams.vision_feature_layer.insert(layer); hparams.vision_feature_layer.insert(layer);
} }
// Calculate the deepest feature layer based on hparams and projector type // Calculate the deepest feature layer based on hparams and projector type
ctx_clip.max_feature_layer = get_deepest_feature_layer(&ctx_clip); // NOTE: This is only used by build_graph_legacy()
{
// Get the index of the second to last layer; this is the default for models that have a llava projector
int n_layer = hparams.n_layer - 1;
int deepest_feature_layer = -1;
if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV
|| ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE
|| ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL
|| ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) {
n_layer += 1;
}
// If we set explicit vision feature layers, only go up to the deepest one
// NOTE: only used by granite-vision models for now
for (const auto & feature_layer : hparams.vision_feature_layer) {
if (feature_layer > deepest_feature_layer) {
deepest_feature_layer = feature_layer;
}
}
ctx_clip.max_feature_layer = deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
}
// model-specific params
switch (ctx_clip.proj_type) {
case PROJECTOR_TYPE_MINICPMV:
{
if (ctx_clip.minicpmv_version == 0) {
ctx_clip.minicpmv_version = 2; // default to 2 if not set
}
} break;
case PROJECTOR_TYPE_IDEFICS3:
{
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
} break;
case PROJECTOR_TYPE_PIXTRAL:
{
hparams.rope_theta = 10000.0f;
} break;
case PROJECTOR_TYPE_QWEN25VL:
{
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
} break;
default:
break;
}
LOG_INF("%s: text_encoder: %d\n", __func__, ctx_clip.has_text_encoder); LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
LOG_INF("%s: vision_encoder: %d\n", __func__, ctx_clip.has_vision_encoder); LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
LOG_INF("%s: llava_projector: %d\n", __func__, ctx_clip.has_llava_projector);
LOG_INF("%s: minicpmv_projector: %d\n", __func__, ctx_clip.has_minicpmv_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version); LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
LOG_INF("%s: glm_projector: %d\n", __func__, ctx_clip.has_glm_projector); LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
LOG_INF("%s: use_silu: %d\n", __func__, ctx_clip.use_silu);
LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu);
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
} }
...@@ -1298,9 +1818,6 @@ struct clip_model_loader { ...@@ -1298,9 +1818,6 @@ struct clip_model_loader {
vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false); vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
if (vision_model.patch_embeddings_1 == nullptr) {
ctx_clip.has_qwen2vl_merger = false;
}
vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false); vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false);
...@@ -1314,16 +1831,28 @@ struct clip_model_loader { ...@@ -1314,16 +1831,28 @@ struct clip_model_loader {
layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight")); layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false); layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false); layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
layer.ff_i_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
layer.ff_o_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight"));
layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false); layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false); layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);
layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false); layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false);
layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false); layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false);
layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false); layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false);
layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false); layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false);
layer.ff_i_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false);
layer.ff_o_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false); // new naming
layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight"));
layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false);
layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false);
layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, "v", il, "bias"), false);
layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false);
// legacy naming (the in and out is reversed! don't ask me why)
layer.ff_i_w = layer.ff_down_w;
layer.ff_o_w = layer.ff_up_w;
layer.ff_g_w = layer.ff_gate_w;
layer.ff_i_b = layer.ff_down_b;
layer.ff_o_b = layer.ff_up_b;
layer.ff_g_b = layer.ff_gate_b;
} }
switch (ctx_clip.proj_type) { switch (ctx_clip.proj_type) {
...@@ -1388,7 +1917,7 @@ struct clip_model_loader { ...@@ -1388,7 +1917,7 @@ struct clip_model_loader {
vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight")); vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias")); vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
} break; } break;
case PROJECTOR_TYPE_RESAMPLER: case PROJECTOR_TYPE_MINICPMV:
{ {
// vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K); vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
...@@ -1420,10 +1949,9 @@ struct clip_model_loader { ...@@ -1420,10 +1949,9 @@ struct clip_model_loader {
vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight")); vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight")); vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight")); vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
vision_model.boi_w = get_tensor(TN_GLM_BOI_W);
vision_model.eoi_w = get_tensor(TN_GLM_EOI_W);
} break; } break;
case PROJECTOR_TYPE_MERGER: case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
{ {
vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
...@@ -1435,6 +1963,19 @@ struct clip_model_loader { ...@@ -1435,6 +1963,19 @@ struct clip_model_loader {
vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break; } break;
case PROJECTOR_TYPE_IDEFICS3:
{
vision_model.projection = get_tensor(TN_MM_PROJECTOR);
} break;
case PROJECTOR_TYPE_PIXTRAL:
{
vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
// [IMG_BREAK] token embedding
vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
} break;
default: default:
GGML_ASSERT(false && "unknown projector type"); GGML_ASSERT(false && "unknown projector type");
} }
...@@ -1503,18 +2044,17 @@ struct clip_model_loader { ...@@ -1503,18 +2044,17 @@ struct clip_model_loader {
} }
void alloc_compute_meta() { void alloc_compute_meta() {
ctx_clip.buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
// create a fake batch // create a fake batch
clip_image_f32_batch batch; clip_image_f32_batch batch;
clip_image_f32_ptr img(clip_image_f32_init()); clip_image_f32_ptr img(clip_image_f32_init());
clip_image_size image_size; clip_image_size image_size;
image_size.width = clip_get_image_size(&ctx_clip); image_size.width = ctx_clip.vision_model.hparams.image_size;
image_size.height = clip_get_image_size(&ctx_clip); image_size.height = ctx_clip.vision_model.hparams.image_size;
int n_patches = clip_get_image_size(&ctx_clip) / image_size.width; img->nx = image_size.width;
img->nx = n_patches; img->ny = image_size.height;
img->ny = n_patches; img->buf.resize(image_size.width * image_size.height * 3);
img->buf.resize(n_patches * image_size.width * image_size.height * 3);
batch.entries.push_back(std::move(img)); batch.entries.push_back(std::move(img));
ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false); ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false);
...@@ -1902,6 +2442,26 @@ struct image_manipulation { ...@@ -1902,6 +2442,26 @@ struct image_manipulation {
} }
} }
// Calculate the size of the **resized** image while preserving the aspect ratio.
//
// The image is never scaled up: the scale factor is min(1, max_dimension / W,
// max_dimension / H), so if H or W is larger than max_dimension the longer
// side is brought down to max_dimension. Each scaled dimension is truncated to
// an integer, clamped to at least one pixel, and then aligned to a multiple of
// align_size with GGML_PAD.
// NOTE(review): GGML_PAD is defined outside this file; it presumably rounds UP
// and may require align_size to be a power of two - confirm against ggml.h.
//
// Returns {0, 0} if any of the arguments is non-positive.
static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) {
    if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) {
        return {0, 0};
    }

    float scale = std::min(1.0f, std::min(static_cast<float>(max_dimension) / inp_size.width,
                                          static_cast<float>(max_dimension) / inp_size.height));

    float target_width_f  = static_cast<float>(inp_size.width)  * scale;
    float target_height_f = static_cast<float>(inp_size.height) * scale;

    // For extreme aspect ratios (e.g. 1 x 10000) the shorter side scales to
    // less than one pixel and the int truncation yields 0; keep at least one
    // pixel so the aligned size never collapses to zero for a valid input.
    int target_width  = std::max(1, (int)target_width_f);
    int target_height = std::max(1, (int)target_height_f);

    int aligned_width  = GGML_PAD(target_width,  align_size);
    int aligned_height = GGML_PAD(target_height, align_size);

    return {aligned_width, aligned_height};
}
private: private:
static inline int clip(int x, int lower, int upper) { static inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper)); return std::max(lower, std::min(x, upper));
...@@ -2194,11 +2754,6 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { ...@@ -2194,11 +2754,6 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found // res_imgs memory is being allocated here, previous allocations will be freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
if (!ctx->has_vision_encoder) {
LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
return false;
}
clip_image_size original_size{img->nx, img->ny}; clip_image_size original_size{img->nx, img->ny};
bool pad_to_square = true; bool pad_to_square = true;
auto & params = ctx->vision_model.hparams; auto & params = ctx->vision_model.hparams;
...@@ -2219,7 +2774,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str ...@@ -2219,7 +2774,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
} }
return true; return true;
} }
else if (ctx->has_qwen2vl_merger) { else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
clip_image_u8 resized; clip_image_u8 resized;
auto patch_size = clip_get_patch_size(ctx) * 2; auto patch_size = clip_get_patch_size(ctx) * 2;
int nx = ceil((float)img->nx / patch_size) * patch_size; int nx = ceil((float)img->nx / patch_size) * patch_size;
...@@ -2233,17 +2788,27 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str ...@@ -2233,17 +2788,27 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->entries.push_back(std::move(img_f32)); res_imgs->entries.push_back(std::move(img_f32));
return true; return true;
} }
else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE
if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { || ctx->proj_type == PROJECTOR_TYPE_GEMMA3
|| ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
clip_image_u8 resized_image; clip_image_u8 resized_image;
int sz = params.image_size; int sz = params.image_size;
image_manipulation::bicubic_resize(*img, resized_image, sz, sz); image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz});
clip_image_f32_ptr img_f32(clip_image_f32_init()); clip_image_f32_ptr img_f32(clip_image_f32_init());
//clip_image_save_to_bmp(resized_image, "resized.bmp"); //clip_image_save_to_bmp(resized_image, "resized.bmp");
normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
res_imgs->entries.push_back(std::move(img_f32)); res_imgs->entries.push_back(std::move(img_f32));
return true; return true;
} }
else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
clip_image_u8 resized_image;
auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
res_imgs->entries.push_back(std::move(img_f32));
return true;
}
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
...@@ -2299,16 +2864,18 @@ void clip_free(clip_ctx * ctx) { ...@@ -2299,16 +2864,18 @@ void clip_free(clip_ctx * ctx) {
delete ctx; delete ctx;
} }
// deprecated
size_t clip_embd_nbytes(const struct clip_ctx * ctx) { size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
int extra_tokens = ctx->has_glm_projector ? 2 : 0; const int32_t nx = ctx->vision_model.hparams.image_size;
return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float); const int32_t ny = ctx->vision_model.hparams.image_size;
return clip_embd_nbytes_by_img(ctx, nx, ny);
} }
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
clip_image_f32 img; clip_image_f32 img;
img.nx = img_w; img.nx = img_w;
img.ny = img_h; img.ny = img_h;
return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
} }
int32_t clip_get_image_size(const struct clip_ctx * ctx) { int32_t clip_get_image_size(const struct clip_ctx * ctx) {
...@@ -2338,21 +2905,44 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) { ...@@ -2338,21 +2905,44 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.image_grid_pinpoints.size(); return ctx->vision_model.hparams.image_grid_pinpoints.size();
} }
// deprecated
int clip_n_patches(const struct clip_ctx * ctx) { int clip_n_patches(const struct clip_ctx * ctx) {
clip_image_f32 img; clip_image_f32 img;
img.nx = ctx->vision_model.hparams.image_size; img.nx = ctx->vision_model.hparams.image_size;
img.ny = ctx->vision_model.hparams.image_size; img.ny = ctx->vision_model.hparams.image_size;
return clip_n_patches_by_img(ctx, &img); return clip_n_output_tokens(ctx, &img);
} }
// deprecated
int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) { int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
return clip_n_output_tokens(ctx, img);
}
// Number of output tokens along the x (width) axis for the given image.
//
// For the Qwen2-VL / Qwen2.5-VL projectors this is the image width divided by
// twice the patch size, rounded up - the same per-axis formula that
// clip_n_output_tokens() uses for these projectors, so that x * y matches the
// total. For every other projector the total token count is returned.
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
    const auto & params = ctx->vision_model.hparams;
    if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
        // divisor and remainder must use the same (doubled) patch size,
        // otherwise a width of (2k+1) * patch_size is under-counted by one
        const int merged_patch_size = params.patch_size * 2;
        return img->nx / merged_patch_size + (int)(img->nx % merged_patch_size > 0);
    }
    return clip_n_output_tokens(ctx, img);
}
// Number of output tokens along the y (height) axis for the given image.
//
// For the Qwen2-VL / Qwen2.5-VL projectors this is the image height divided by
// twice the patch size, rounded up - the same per-axis formula that
// clip_n_output_tokens() uses for these projectors. Every other projector
// reports a single row (returns 1).
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
    const auto & params = ctx->vision_model.hparams;
    if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
        // divisor and remainder must use the same (doubled) patch size,
        // otherwise a height of (2k+1) * patch_size is under-counted by one
        const int merged_patch_size = params.patch_size * 2;
        return img->ny / merged_patch_size + (int)(img->ny % merged_patch_size > 0);
    }
    return 1;
}
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->vision_model.hparams; const auto & params = ctx->vision_model.hparams;
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
n_patches /= 4; n_patches /= 4;
} else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
if (ctx->minicpmv_version == 2) { if (ctx->minicpmv_version == 2) {
n_patches = 96; n_patches = 96;
} }
...@@ -2362,13 +2952,22 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i ...@@ -2362,13 +2952,22 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
else if (ctx->minicpmv_version == 4) { else if (ctx->minicpmv_version == 4) {
n_patches = 64; n_patches = 64;
} }
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { else {
GGML_ABORT("Unknown minicpmv version");
}
} else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
int patch_size = params.patch_size * 2; int patch_size = params.patch_size * 2;
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
n_patches = x_patch * y_patch; n_patches = x_patch * y_patch;
} else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
n_patches = 256; n_patches = 256;
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
n_patches /= ctx->vision_model.hparams.proj_scale_factor;
} else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
int n_patches_x = img->nx / params.patch_size;
int n_patches_y = img->ny / params.patch_size;
n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
} }
return n_patches; return n_patches;
...@@ -2461,11 +3060,6 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co ...@@ -2461,11 +3060,6 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
} }
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) {
LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
return false;
}
clip_image_f32_batch imgs; clip_image_f32_batch imgs;
clip_image_f32_ptr img_copy(clip_image_f32_init()); clip_image_f32_ptr img_copy(clip_image_f32_init());
*img_copy = *img; *img_copy = *img;
...@@ -2476,24 +3070,12 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 ...@@ -2476,24 +3070,12 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
const clip_image_f32_batch & imgs = *imgs_c_ptr; const clip_image_f32_batch & imgs = *imgs_c_ptr;
if (!ctx->has_vision_encoder) {
LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
return false;
}
int batch_size = imgs.entries.size(); int batch_size = imgs.entries.size();
if (ctx->has_llava_projector) {
GGML_ASSERT(batch_size == 1); // TODO: support multiple images if (ctx->has_llava_projector
} || ctx->proj_type == PROJECTOR_TYPE_MINICPMV
if (ctx->has_minicpmv_projector) { || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
GGML_ASSERT(batch_size == 1);
}
if (ctx->has_glm_projector) {
GGML_ASSERT(batch_size == 1); GGML_ASSERT(batch_size == 1);
ggml_tensor * boi = ctx->vision_model.boi_w;
ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
} }
// build the inference graph // build the inference graph
...@@ -2502,164 +3084,283 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ...@@ -2502,164 +3084,283 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
// set inputs // set inputs
const auto & model = ctx->vision_model; const auto & model = ctx->vision_model;
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
const int image_size = hparams.image_size; const int image_size_width = imgs.entries[0]->nx;
int image_size_width = image_size; const int image_size_height = imgs.entries[0]->ny;
int image_size_height = image_size;
if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
image_size_width = imgs.entries[0]->nx;
image_size_height = imgs.entries[0]->ny;
}
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (model.class_embedding ? 1 : 0); const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
const int pos_w = ctx->load_image_size.width / patch_size; const int pos_w = ctx->load_image_size.width / patch_size;
const int pos_h = ctx->load_image_size.height / patch_size; const int pos_h = ctx->load_image_size.height / patch_size;
const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
auto get_inp_tensor = [&gf](const char * name) {
struct ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
if (inp == nullptr) {
GGML_ABORT("Failed to get tensor %s", name);
}
if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
GGML_ABORT("Tensor %s is not an input tensor", name);
}
return inp;
};
auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
ggml_tensor * cur = get_inp_tensor(name);
GGML_ASSERT(cur->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
};
auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
ggml_tensor * cur = get_inp_tensor(name);
GGML_ASSERT(cur->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
};
// set input pixel values
{ {
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); size_t nelem = 0;
float * data = (float *)malloc(ggml_nbytes(inp_raw)); for (const auto & img : imgs.entries) {
nelem += img->nx * img->ny * 3;
}
std::vector<float> inp_raw(nelem);
// layout of data (note: the channel dim is unrolled to better visualize the layout):
//
// ┌──W──┐
// │ H │ channel = R
// ├─────┤ │
// │ H │ channel = G
// ├─────┤ │
// │ H │ channel = B
// └─────┘ │
// ──────┘ x B
for (size_t i = 0; i < imgs.entries.size(); i++) { for (size_t i = 0; i < imgs.entries.size(); i++) {
const int nx = imgs.entries[i]->nx; const int nx = imgs.entries[i]->nx;
const int ny = imgs.entries[i]->ny; const int ny = imgs.entries[i]->ny;
if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
GGML_ASSERT(nx == image_size && ny == image_size);
}
const int n = nx * ny; const int n = nx * ny;
for (int b = 0; b < batch_size; b++) { for (int b = 0; b < batch_size; b++) {
for (int k = 0; k < 3; k++) { float * batch_entry = inp_raw.data() + b * (3*n);
for (int y = 0; y < ny; y++) { for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) { for (int x = 0; x < nx; x++) {
data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k]; size_t base_src = 3*(y * nx + x); // idx of the first channel
} size_t base_dst = y * nx + x; // idx of the first channel
batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
} }
} }
} }
} }
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); set_input_f32("inp_raw", inp_raw);
free(data);
} }
if (ctx->has_minicpmv_projector) {
{
// inspired from siglip:
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions));
int bucket_coords_h[1024];
int bucket_coords_w[1024];
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
}
for (int i = 0; i < pos_w; i++){
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
}
for (int i = 0, id = 0; i < pos_h; i++){
for (int j = 0; j < pos_w; j++){
positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
}
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
{
// inspired from resampler of Qwen-VL:
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
int embed_dim = 4096;
if (ctx->minicpmv_version == 2) {
embed_dim = 4096;
}
else if (ctx->minicpmv_version == 3) {
embed_dim = 3584;
}
else if (ctx->minicpmv_version == 4) {
embed_dim = 3584;
}
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); // set input per projector
for(int i=0;i < pos_w * pos_h; ++i){ switch (ctx->proj_type) {
for(int j=0; j < embed_dim; ++j){ case PROJECTOR_TYPE_MINICPMV:
pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j]; {
// inspired from siglip:
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
std::vector<int32_t> positions(pos_h * pos_w);
int bucket_coords_h[1024];
int bucket_coords_w[1024];
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
} }
} for (int i = 0; i < pos_w; i++){
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
}
for (int i = 0, id = 0; i < pos_h; i++){
for (int j = 0; j < pos_w; j++){
positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
}
}
set_input_i32("positions", positions);
ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed)); // inspired from resampler of Qwen-VL:
free(pos_embed_data); // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
} // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
} int embed_dim = clip_n_mmproj_embd(ctx);
else {
if (model.class_embedding) {
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
void* zero_mem = malloc(ggml_nbytes(embeddings)); // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
memset(zero_mem, 0, ggml_nbytes(embeddings)); auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
free(zero_mem);
}
if (ctx->has_qwen2vl_merger) { std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); for(int i = 0; i < pos_w * pos_h; ++i){
for(int j = 0; j < embed_dim; ++j){
pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
}
}
const int pw = image_size_width / patch_size; set_input_f32("pos_embed", pos_embed);
const int ph = image_size_height / patch_size; } break;
int* positions_data = (int*)malloc(ggml_nbytes(positions)); case PROJECTOR_TYPE_QWEN2VL:
{
const int merge_ratio = 2;
const int pw = image_size_width / patch_size;
const int ph = image_size_height / patch_size;
std::vector<int> positions(num_positions * 4);
int ptr = 0;
for (int y = 0; y < ph; y += merge_ratio) {
for (int x = 0; x < pw; x += merge_ratio) {
for (int dy = 0; dy < 2; dy++) {
for (int dx = 0; dx < 2; dx++) {
positions[ ptr] = y + dy;
positions[ num_patches + ptr] = x + dx;
positions[2 * num_patches + ptr] = y + dy;
positions[3 * num_patches + ptr] = x + dx;
ptr++;
}
}
}
}
int ptr = 0; set_input_i32("positions", positions);
for (int y = 0; y < ph; y+=2) } break;
case PROJECTOR_TYPE_QWEN25VL:
{ {
for (int x = 0; x < pw; x+=2) // pw * ph = number of tokens output by ViT after apply patch merger
{ // ipw * ipw = number of vision token been processed inside ViT
for (int dy = 0; dy < 2; dy++) { const int merge_ratio = 2;
for (int dx = 0; dx < 2; dx++) { const int pw = image_size_width / patch_size / merge_ratio;
positions_data[ptr] = y + dy; const int ph = image_size_height / patch_size / merge_ratio;
positions_data[num_patches + ptr] = x + dx; const int ipw = image_size_width / patch_size;
positions_data[num_patches * 2 + ptr] = y + dy; const int iph = image_size_height / patch_size;
positions_data[num_patches * 3 + ptr] = x + dx;
ptr++; std::vector<int> idx (ph * pw);
std::vector<int> inv_idx(ph * pw);
if (use_window_attn) {
const int attn_window_size = 112;
const int grid_window = attn_window_size / patch_size / merge_ratio;
int dst = 0;
// [num_vision_tokens, num_vision_tokens] attention mask tensor
std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
int mask_row = 0;
for (int y = 0; y < ph; y += grid_window) {
for (int x = 0; x < pw; x += grid_window) {
const int win_h = std::min(grid_window, ph - y);
const int win_w = std::min(grid_window, pw - x);
const int dst_0 = dst;
// group all tokens belong to the same window togather (to a continue range)
for (int dy = 0; dy < win_h; dy++) {
for (int dx = 0; dx < win_w; dx++) {
const int src = (y + dy) * pw + (x + dx);
GGML_ASSERT(src < (int)idx.size());
GGML_ASSERT(dst < (int)inv_idx.size());
idx [src] = dst;
inv_idx[dst] = src;
dst++;
}
}
for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
int row_offset = mask_row * (ipw * iph);
std::fill(
mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
0.0);
mask_row++;
}
} }
} }
set_input_i32("window_idx", idx);
set_input_i32("inv_window_idx", inv_idx);
set_input_f32("window_mask", mask);
} else {
for (int i = 0; i < ph * pw; i++) {
idx[i] = i;
}
} }
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); const int mpow = merge_ratio * merge_ratio;
free(positions_data); std::vector<int> positions(num_positions * 4);
}
else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { int ptr = 0;
// do nothing for (int y = 0; y < iph; y += merge_ratio) {
} for (int x = 0; x < ipw; x += merge_ratio) {
else { for (int dy = 0; dy < 2; dy++) {
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); for (int dx = 0; dx < 2; dx++) {
auto remap = idx[ptr / mpow];
remap = (remap * mpow) + (ptr % mpow);
positions[ remap] = y + dy;
positions[ num_patches + remap] = x + dx;
positions[2 * num_patches + remap] = y + dy;
positions[3 * num_patches + remap] = x + dx;
ptr++;
}
}
}
}
int* positions_data = (int*)malloc(ggml_nbytes(positions)); set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_PIXTRAL:
{
// set the 2D positions
int n_patches_per_col = image_size_width / patch_size;
std::vector<int> pos_data(num_positions);
// dimension H
for (int i = 0; i < num_positions; i++) {
pos_data[i] = i / n_patches_per_col;
}
set_input_i32("pos_h", pos_data);
// dimension W
for (int i = 0; i < num_positions; i++) {
pos_data[i] = i % n_patches_per_col;
}
set_input_i32("pos_w", pos_data);
} break;
case PROJECTOR_TYPE_GLM_EDGE:
{
// llava and other models
std::vector<int32_t> positions(num_positions);
for (int i = 0; i < num_positions; i++) { for (int i = 0; i < num_positions; i++) {
positions_data[i] = i; positions[i] = i;
} }
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); set_input_i32("positions", positions);
free(positions_data); } break;
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
case PROJECTOR_TYPE_LDP:
case PROJECTOR_TYPE_LDPV2:
{
// llava and other models
std::vector<int32_t> positions(num_positions);
for (int i = 0; i < num_positions; i++) {
positions[i] = i;
}
set_input_i32("positions", positions);
if (!ctx->has_glm_projector) {
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
// The patches vector is used to get rows to index into the embeds with; // The patches vector is used to get rows to index into the embeds with;
// we should skip dim 0 only if we have CLS to avoid going out of bounds // we should skip dim 0 only if we have CLS to avoid going out of bounds
// when retrieving the rows. // when retrieving the rows.
int patch_offset = model.class_embedding ? 1 : 0; int patch_offset = model.class_embedding ? 1 : 0;
int* patches_data = (int*)malloc(ggml_nbytes(patches)); std::vector<int32_t> patches(num_patches);
for (int i = 0; i < num_patches; i++) { for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + patch_offset; patches[i] = i + patch_offset;
} }
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); set_input_i32("patches", patches);
free(patches_data); } break;
} case PROJECTOR_TYPE_GEMMA3:
} case PROJECTOR_TYPE_IDEFICS3:
{
// do nothing
} break;
default:
GGML_ABORT("Unknown projector type");
} }
ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
...@@ -2676,13 +3377,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ...@@ -2676,13 +3377,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// copy the embeddings to the location passed by the user // copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
if (ctx->has_glm_projector) {
//eoi
ggml_tensor * eoi = ctx->vision_model.eoi_w;
int offset = ggml_nelements(embeddings);
ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
}
return true; return true;
} }
...@@ -2822,56 +3516,52 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i ...@@ -2822,56 +3516,52 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
} }
int clip_n_mmproj_embd(const struct clip_ctx * ctx) { int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP) { switch (ctx->proj_type) {
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; case PROJECTOR_TYPE_LDP:
} return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) { case PROJECTOR_TYPE_LDPV2:
return ctx->vision_model.mm_model_peg_0_b->ne[0]; return ctx->vision_model.mm_model_peg_0_b->ne[0];
} case PROJECTOR_TYPE_MLP:
if (ctx->proj_type == PROJECTOR_TYPE_MLP) { case PROJECTOR_TYPE_PIXTRAL:
return ctx->vision_model.mm_2_b->ne[0]; return ctx->vision_model.mm_2_b->ne[0];
} case PROJECTOR_TYPE_MLP_NORM:
if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { return ctx->vision_model.mm_3_b->ne[0];
return ctx->vision_model.mm_3_b->ne[0]; case PROJECTOR_TYPE_MINICPMV:
} if (ctx->minicpmv_version == 2) {
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { return 4096;
if (ctx->minicpmv_version == 2) { } else if (ctx->minicpmv_version == 3) {
return 4096; return 3584;
} } else if (ctx->minicpmv_version == 4) {
else if (ctx->minicpmv_version == 3) { return 3584;
return 3584; }
} GGML_ABORT("Unknown minicpmv version");
else if (ctx->minicpmv_version == 4) { case PROJECTOR_TYPE_GLM_EDGE:
return 3584; return ctx->vision_model.mm_model_mlp_3_w->ne[1];
} case PROJECTOR_TYPE_QWEN2VL:
} case PROJECTOR_TYPE_QWEN25VL:
if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){ return ctx->vision_model.mm_1_b->ne[0];
return ctx->vision_model.mm_model_mlp_3_w->ne[1]; case PROJECTOR_TYPE_GEMMA3:
} return ctx->vision_model.mm_input_proj_w->ne[0];
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { case PROJECTOR_TYPE_IDEFICS3:
return ctx->vision_model.mm_1_b->ne[0]; return ctx->vision_model.projection->ne[1];
} default:
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { GGML_ABORT("Unknown projector type");
return ctx->vision_model.mm_input_proj_w->ne[0];
} }
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(string_format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
} }
int clip_is_minicpmv(const struct clip_ctx * ctx) { int clip_is_minicpmv(const struct clip_ctx * ctx) {
if (ctx->has_minicpmv_projector) { if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
return ctx->minicpmv_version; return ctx->minicpmv_version;
} }
return 0; return 0;
} }
bool clip_is_glm(const struct clip_ctx * ctx) { bool clip_is_glm(const struct clip_ctx * ctx) {
return ctx->has_glm_projector; return ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE;
} }
bool clip_is_qwen2vl(const struct clip_ctx * ctx) { bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
return ctx->has_qwen2vl_merger; return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
} }
bool clip_is_llava(const struct clip_ctx * ctx) { bool clip_is_llava(const struct clip_ctx * ctx) {
...@@ -2882,29 +3572,6 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) { ...@@ -2882,29 +3572,6 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
return ctx->proj_type == PROJECTOR_TYPE_GEMMA3; return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
} }
// Work out how many encoder layers have to be evaluated.
//
// If hparams.vision_feature_layer contains any non-negative entry, the largest
// one is returned. Otherwise the result is n_layer - 1 (second-to-last layer,
// the default for llava-style projectors), or n_layer when a minicpmv, glm or
// qwen2vl projector is present (i.e. use the last encoder layer).
int get_deepest_feature_layer(const struct clip_ctx * ctx) {
    const auto & hparams = ctx->vision_model.hparams;

    // explicitly requested feature layers win: only go as deep as the deepest one
    int max_requested = -1;
    for (const auto & layer_idx : hparams.vision_feature_layer) {
        if (layer_idx > max_requested) {
            max_requested = layer_idx;
        }
    }
    if (max_requested >= 0) {
        return max_requested;
    }

    // no explicit layers: fall back to the projector-dependent default
    int fallback_layer = hparams.n_layer - 1;
    const bool wants_last_layer = ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger;
    if (wants_last_layer) {
        fallback_layer += 1;
    }
    return fallback_layer;
}
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
clip_image_f32 clip_img; clip_image_f32 clip_img;
clip_img.buf.resize(h * w * 3); clip_img.buf.resize(h * w * 3);
......
...@@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par ...@@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
...@@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); ...@@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx); CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx); GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img); "use clip_n_output_tokens instead");
CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx); GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
"use clip_n_output_tokens instead");
CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
// for M-RoPE, this will be the number of token positions in X and Y directions
// for other models, X will be the total number of tokens and Y will be 1
CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
// this should be equal to the embedding dimension of the text model
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
...@@ -114,8 +125,6 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx); ...@@ -114,8 +125,6 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx); CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx); CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
......
...@@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair< ...@@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
} }
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
struct { struct {
struct ggml_context * ctx; struct ggml_context * ctx;
} model; } model;
...@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> ...@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
model.ctx = ggml_init(params); model.ctx = ggml_init(params);
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base // fill it with the image embeddings, ignoring the base
for (size_t i = 1; i < num_images; i++) { for (size_t i = 1; i < num_images; i++) {
...@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> ...@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
// append without newline tokens (default behavior in llava_arch when not using unpad ): // append without newline tokens (default behavior in llava_arch when not using unpad ):
memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip)); *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
// Debug: Test single segments // Debug: Test single segments
// Current findings: sending base image, sending a segment embedding all works similar to python // Current findings: sending base image, sending a segment embedding all works similar to python
...@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli ...@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
image_embd_v[i], image_embd_v[i],
clip_embd_nbytes_by_img(ctx_clip, nx, ny)); clip_embd_nbytes_by_img(ctx_clip, nx, ny));
n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res); n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
} }
*n_img_pos = n_img_pos_out; *n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) { for (size_t i = 0; i < image_embd_v.size(); i++) {
...@@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli ...@@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
} }
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding // flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0); clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096 bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
if (!encoded) { if (!encoded) {
LOG_ERR("Unable to encode image\n"); LOG_ERR("Unable to encode image\n");
...@@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli ...@@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
int n_img_pos_out; int n_img_pos_out;
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
*n_img_pos = n_img_pos_out; *n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) { for (size_t i = 0; i < image_embd_v.size(); i++) {
......
...@@ -111,6 +111,7 @@ extern "C" { ...@@ -111,6 +111,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31, LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
}; };
enum llama_rope_type { enum llama_rope_type {
...@@ -1237,6 +1238,7 @@ extern "C" { ...@@ -1237,6 +1238,7 @@ extern "C" {
"will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
/// Setting k <= 0 makes this a noop
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
......
...@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { ...@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" }, { LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
{ LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_STABLELM, "stablelm" },
...@@ -73,7 +74,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { ...@@ -73,7 +74,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" }, { LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" },
{ LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_UNKNOWN, "(unknown)" }, { LLM_ARCH_UNKNOWN, "(unknown)" },
}; };
...@@ -109,6 +109,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { ...@@ -109,6 +109,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" }, { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" }, { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" }, { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
...@@ -511,6 +512,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N ...@@ -511,6 +512,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
}, },
}, },
{
LLM_ARCH_NOMIC_BERT_MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{ {
LLM_ARCH_JINA_BERT_V2, LLM_ARCH_JINA_BERT_V2,
{ {
...@@ -1587,22 +1606,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N ...@@ -1587,22 +1606,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
}, },
}, },
{
LLM_ARCH_MISTRAL3,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
}
},
{ {
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
{ {
......
...@@ -24,6 +24,7 @@ enum llm_arch { ...@@ -24,6 +24,7 @@ enum llm_arch {
LLM_ARCH_REFACT, LLM_ARCH_REFACT,
LLM_ARCH_BERT, LLM_ARCH_BERT,
LLM_ARCH_NOMIC_BERT, LLM_ARCH_NOMIC_BERT,
LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_JINA_BERT_V2, LLM_ARCH_JINA_BERT_V2,
LLM_ARCH_BLOOM, LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM, LLM_ARCH_STABLELM,
...@@ -75,7 +76,6 @@ enum llm_arch { ...@@ -75,7 +76,6 @@ enum llm_arch {
LLM_ARCH_CHAMELEON, LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR, LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_MISTRAL3,
LLM_ARCH_PLM, LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE, LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
...@@ -113,6 +113,7 @@ enum llm_kv { ...@@ -113,6 +113,7 @@ enum llm_kv {
LLM_KV_EXPERT_WEIGHTS_SCALE, LLM_KV_EXPERT_WEIGHTS_SCALE,
LLM_KV_EXPERT_WEIGHTS_NORM, LLM_KV_EXPERT_WEIGHTS_NORM,
LLM_KV_EXPERT_GATING_FUNC, LLM_KV_EXPERT_GATING_FUNC,
LLM_KV_MOE_EVERY_N_LAYERS,
LLM_KV_POOLING_TYPE, LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE, LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID, LLM_KV_DECODER_START_TOKEN_ID,
......
...@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = { ...@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 }, { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R }, { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 }, { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 }, { "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 }, { "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 },
{ "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE }, { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM }, { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 }, { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
...@@ -62,6 +62,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = { ...@@ -62,6 +62,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "yandex", LLM_CHAT_TEMPLATE_YANDEX }, { "yandex", LLM_CHAT_TEMPLATE_YANDEX },
{ "bailing", LLM_CHAT_TEMPLATE_BAILING }, { "bailing", LLM_CHAT_TEMPLATE_BAILING },
{ "llama4", LLM_CHAT_TEMPLATE_LLAMA4 }, { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
{ "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
}; };
llm_chat_template llm_chat_template_from_str(const std::string & name) { llm_chat_template llm_chat_template_from_str(const std::string & name) {
...@@ -81,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { ...@@ -81,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
if (tmpl_contains("<|im_start|>")) { if (tmpl_contains("<|im_start|>")) {
return tmpl_contains("<|im_sep|>") return tmpl_contains("<|im_sep|>")
? LLM_CHAT_TEMPLATE_PHI_4 ? LLM_CHAT_TEMPLATE_PHI_4
: LLM_CHAT_TEMPLATE_CHATML; : tmpl_contains("<end_of_utterance>")
? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
: LLM_CHAT_TEMPLATE_CHATML;
} else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) { } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
if (tmpl_contains("[SYSTEM_PROMPT]")) { if (tmpl_contains("[SYSTEM_PROMPT]")) {
return LLM_CHAT_TEMPLATE_MISTRAL_V7; return LLM_CHAT_TEMPLATE_MISTRAL_V7;
...@@ -119,8 +122,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { ...@@ -119,8 +122,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
} }
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
return LLM_CHAT_TEMPLATE_PHI_3; return LLM_CHAT_TEMPLATE_PHI_3;
} else if (tmpl_contains("[gMASK]<sop>")) {
return LLM_CHAT_TEMPLATE_CHATGLM_4;
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) { } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE; return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
} else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
return LLM_CHAT_TEMPLATE_GLMEDGE;
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) { } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
return LLM_CHAT_TEMPLATE_ZEPHYR; return LLM_CHAT_TEMPLATE_ZEPHYR;
} else if (tmpl_contains("bos_token + message['role']")) { } else if (tmpl_contains("bos_token + message['role']")) {
...@@ -149,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { ...@@ -149,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_LLAMA_3; return LLM_CHAT_TEMPLATE_LLAMA_3;
} else if (tmpl_contains("[gMASK]sop")) { } else if (tmpl_contains("[gMASK]sop")) {
// chatglm3-6b // chatglm3-6b
return LLM_CHAT_TEMPLATE_CHATGML_3; return LLM_CHAT_TEMPLATE_CHATGLM_3;
} else if (tmpl_contains("[gMASK]<sop>")) {
return LLM_CHAT_TEMPLATE_CHATGML_4;
} else if (tmpl_contains(LU8("<用户>"))) { } else if (tmpl_contains(LU8("<用户>"))) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
return LLM_CHAT_TEMPLATE_MINICPM; return LLM_CHAT_TEMPLATE_MINICPM;
...@@ -432,7 +437,7 @@ int32_t llm_chat_apply_template( ...@@ -432,7 +437,7 @@ int32_t llm_chat_apply_template(
if (add_ass) { if (add_ass) {
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
} }
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) { } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
// chatglm3-6b // chatglm3-6b
ss << "[gMASK]" << "sop"; ss << "[gMASK]" << "sop";
for (auto message : chat) { for (auto message : chat) {
...@@ -442,7 +447,7 @@ int32_t llm_chat_apply_template( ...@@ -442,7 +447,7 @@ int32_t llm_chat_apply_template(
if (add_ass) { if (add_ass) {
ss << "<|assistant|>"; ss << "<|assistant|>";
} }
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) { } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
ss << "[gMASK]" << "<sop>"; ss << "[gMASK]" << "<sop>";
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
...@@ -451,14 +456,6 @@ int32_t llm_chat_apply_template( ...@@ -451,14 +456,6 @@ int32_t llm_chat_apply_template(
if (add_ass) { if (add_ass) {
ss << "<|assistant|>"; ss << "<|assistant|>";
} }
} else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
for (auto message : chat) {
std::string role(message->role);
ss << "<|" << role << "|>" << "\n" << message->content;
}
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) { } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
for (auto message : chat) { for (auto message : chat) {
...@@ -620,7 +617,23 @@ int32_t llm_chat_apply_template( ...@@ -620,7 +617,23 @@ int32_t llm_chat_apply_template(
if (add_ass) { if (add_ass) {
ss << "<|header_start|>assistant<|header_end|>\n\n"; ss << "<|header_start|>assistant<|header_end|>\n\n";
} }
} else { } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
// SmolVLM
ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
ss << message->content << "\n\n";
} else if (role == "user") {
ss << "User: " << message->content << "<end_of_utterance>\n";
} else {
ss << "Assistant: " << message->content << "<end_of_utterance>\n";
}
}
if (add_ass) {
ss << "Assistant:";
}
} else {
// template not supported // template not supported
return -1; return -1;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment