Unverified Commit 20c5fd39 authored by Devon Rifkin's avatar Devon Rifkin Committed by GitHub
Browse files

Merge branch 'main' into drifkin/array-head-count-simple

parents d2ee599d 6e9a7a25
package ggml
import (
"bytes"
"os"
"slices"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestWriteGGUF(t *testing.T) {
w, err := os.CreateTemp(t.TempDir(), "*.bin")
if err != nil {
t.Fatal(err)
}
defer w.Close()
if err := WriteGGUF(w, KV{
"general.alignment": uint32(16),
}, []*Tensor{
{Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.3", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.4", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.5", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
}); err != nil {
t.Fatal(err)
}
r, err := os.Open(w.Name())
if err != nil {
t.Fatal(err)
}
defer r.Close()
ff, _, err := Decode(r, 0)
if err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(ff.KV(), KV{
"general.alignment": uint32(16),
"general.parameter_count": uint64(36),
}); diff != "" {
t.Errorf("Mismatch (-want +got):\n%s", diff)
}
if diff := cmp.Diff(ff.Tensors(), Tensors{
Offset: 336,
items: []*Tensor{
{Name: "test.0", Offset: 0, Shape: []uint64{2, 3}},
{Name: "test.1", Offset: 32, Shape: []uint64{2, 3}},
{Name: "test.2", Offset: 64, Shape: []uint64{2, 3}},
{Name: "test.3", Offset: 96, Shape: []uint64{2, 3}},
{Name: "test.4", Offset: 128, Shape: []uint64{2, 3}},
{Name: "test.5", Offset: 160, Shape: []uint64{2, 3}},
},
}, cmp.AllowUnexported(Tensors{})); diff != "" {
t.Errorf("Mismatch (-want +got):\n%s", diff)
}
}
package ggml
import "fmt"
import (
"fmt"
"log/slog"
"strings"
)
type fileType uint32
// FileType is the Go equivalent to llama_ftype used for gguf file typing
type FileType uint32
const (
fileTypeF32 fileType = iota
fileTypeF16
fileTypeQ4_0
fileTypeQ4_1
fileTypeQ4_1_F16
fileTypeQ4_2 // unused
fileTypeQ4_3 // unused
fileTypeQ8_0
fileTypeQ5_0
fileTypeQ5_1
fileTypeQ2_K
fileTypeQ3_K_S
fileTypeQ3_K_M
fileTypeQ3_K_L
fileTypeQ4_K_S
fileTypeQ4_K_M
fileTypeQ5_K_S
fileTypeQ5_K_M
fileTypeQ6_K
fileTypeIQ2_XXS
fileTypeIQ2_XS
fileTypeQ2_K_S
fileTypeIQ3_XS
fileTypeIQ3_XXS
fileTypeIQ1_S
fileTypeIQ4_NL
fileTypeIQ3_S
fileTypeIQ3_M
fileTypeIQ2_S
fileTypeIQ2_M
fileTypeIQ4_XS
fileTypeIQ1_M
fileTypeBF16
FileTypeF32 FileType = iota
FileTypeF16
FileTypeQ4_0
FileTypeQ4_1
fileTypeQ4_1_F16 // unused by GGML
fileTypeQ4_2 // unused by GGML
fileTypeQ4_3 // unused by GGML
FileTypeQ8_0
FileTypeQ5_0
FileTypeQ5_1
FileTypeQ2_K
FileTypeQ3_K_S
FileTypeQ3_K_M
FileTypeQ3_K_L
FileTypeQ4_K_S
FileTypeQ4_K_M
FileTypeQ5_K_S
FileTypeQ5_K_M
FileTypeQ6_K
fileTypeIQ2_XXS // not supported by ollama
fileTypeIQ2_XS // not supported by ollama
FileTypeQ2_K_S
fileTypeIQ3_XS // not supported by ollama
fileTypeIQ3_XXS // not supported by ollama
fileTypeIQ1_S // not supported by ollama
fileTypeIQ4_NL // not supported by ollama
fileTypeIQ3_S // not supported by ollama
fileTypeIQ3_M // not supported by ollama
fileTypeIQ2_S // not supported by ollama
fileTypeIQ2_M // not supported by ollama
fileTypeIQ4_XS // not supported by ollama
fileTypeIQ1_M // not supported by ollama
FileTypeBF16
fileTypeQ4_0_4_4 // unused by GGML
fileTypeQ4_0_4_8 // unused by GGML
fileTypeQ4_0_8_8 // unused by GGML
fileTypeTQ1_0 // not supported by ollama
fileTypeTQ2_0 // not supported by ollama
fileTypeUnknown
FileTypeUnknown = 1024
)
func ParseFileType(s string) (fileType, error) {
// ParseFileType parses the provided GGUF file type
// Only Ollama supported types are considered valid
func ParseFileType(s string) (FileType, error) {
switch s {
case "F32":
return fileTypeF32, nil
return FileTypeF32, nil
case "F16":
return fileTypeF16, nil
return FileTypeF16, nil
case "Q4_0":
return fileTypeQ4_0, nil
return FileTypeQ4_0, nil
case "Q4_1":
return fileTypeQ4_1, nil
case "Q4_1_F16":
return fileTypeQ4_1_F16, nil
return FileTypeQ4_1, nil
case "Q8_0":
return fileTypeQ8_0, nil
return FileTypeQ8_0, nil
case "Q5_0":
return fileTypeQ5_0, nil
return FileTypeQ5_0, nil
case "Q5_1":
return fileTypeQ5_1, nil
return FileTypeQ5_1, nil
case "Q2_K":
return fileTypeQ2_K, nil
return FileTypeQ2_K, nil
case "Q3_K_S":
return fileTypeQ3_K_S, nil
return FileTypeQ3_K_S, nil
case "Q3_K_M":
return fileTypeQ3_K_M, nil
return FileTypeQ3_K_M, nil
case "Q3_K_L":
return fileTypeQ3_K_L, nil
return FileTypeQ3_K_L, nil
case "Q4_K_S":
return fileTypeQ4_K_S, nil
case "Q4_K_M":
return fileTypeQ4_K_M, nil
return FileTypeQ4_K_S, nil
case "Q4_K_M", "Q4_K":
return FileTypeQ4_K_M, nil
case "Q5_K_S":
return fileTypeQ5_K_S, nil
case "Q5_K_M":
return fileTypeQ5_K_M, nil
return FileTypeQ5_K_S, nil
case "Q5_K_M", "Q5_K":
return FileTypeQ5_K_M, nil
case "Q6_K":
return fileTypeQ6_K, nil
case "IQ2_XXS":
return fileTypeIQ2_XXS, nil
case "IQ2_XS":
return fileTypeIQ2_XS, nil
return FileTypeQ6_K, nil
case "Q2_K_S":
return fileTypeQ2_K_S, nil
case "IQ3_XS":
return fileTypeIQ3_XS, nil
case "IQ3_XXS":
return fileTypeIQ3_XXS, nil
case "IQ1_S":
return fileTypeIQ1_S, nil
case "IQ4_NL":
return fileTypeIQ4_NL, nil
case "IQ3_S":
return fileTypeIQ3_S, nil
case "IQ3_M":
return fileTypeIQ3_M, nil
case "IQ2_S":
return fileTypeIQ2_S, nil
case "IQ2_M":
return fileTypeIQ2_M, nil
case "IQ4_XS":
return fileTypeIQ4_XS, nil
case "IQ1_M":
return fileTypeIQ1_M, nil
return FileTypeQ2_K_S, nil
case "BF16":
return fileTypeBF16, nil
return FileTypeBF16, nil
default:
return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
supportedFileTypes := []FileType{
FileTypeF32,
FileTypeF16,
FileTypeQ4_K_S,
FileTypeQ4_K_M,
FileTypeQ8_0,
// fsggml.FileTypeBF16, // TODO
}
strs := make([]string, len(supportedFileTypes))
for i := range supportedFileTypes {
strs[i] = supportedFileTypes[i].String()
}
return FileTypeUnknown, fmt.Errorf("unsupported quantization type %s - supported types are %s", s, strings.Join(strs, ", "))
}
}
func (t fileType) String() string {
func (t FileType) String() string {
switch t {
case fileTypeF32:
case FileTypeF32:
return "F32"
case fileTypeF16:
case FileTypeF16:
return "F16"
case fileTypeQ4_0:
case FileTypeQ4_0:
return "Q4_0"
case fileTypeQ4_1:
case FileTypeQ4_1:
return "Q4_1"
case fileTypeQ4_1_F16:
return "Q4_1_F16"
case fileTypeQ8_0:
case FileTypeQ8_0:
return "Q8_0"
case fileTypeQ5_0:
case FileTypeQ5_0:
return "Q5_0"
case fileTypeQ5_1:
case FileTypeQ5_1:
return "Q5_1"
case fileTypeQ2_K:
case FileTypeQ2_K:
return "Q2_K"
case fileTypeQ3_K_S:
case FileTypeQ3_K_S:
return "Q3_K_S"
case fileTypeQ3_K_M:
case FileTypeQ3_K_M:
return "Q3_K_M"
case fileTypeQ3_K_L:
case FileTypeQ3_K_L:
return "Q3_K_L"
case fileTypeQ4_K_S:
case FileTypeQ4_K_S:
return "Q4_K_S"
case fileTypeQ4_K_M:
case FileTypeQ4_K_M:
return "Q4_K_M"
case fileTypeQ5_K_S:
case FileTypeQ5_K_S:
return "Q5_K_S"
case fileTypeQ5_K_M:
case FileTypeQ5_K_M:
return "Q5_K_M"
case fileTypeQ6_K:
case FileTypeQ6_K:
return "Q6_K"
case fileTypeIQ2_XXS:
return "IQ2_XXS"
case fileTypeIQ2_XS:
return "IQ2_XS"
case fileTypeQ2_K_S:
case FileTypeQ2_K_S:
return "Q2_K_S"
case fileTypeIQ3_XS:
return "IQ3_XS"
case fileTypeIQ3_XXS:
return "IQ3_XXS"
case fileTypeIQ1_S:
return "IQ1_S"
case fileTypeIQ4_NL:
return "IQ4_NL"
case fileTypeIQ3_S:
return "IQ3_S"
case fileTypeIQ3_M:
return "IQ3_M"
case fileTypeIQ2_S:
return "IQ2_S"
case fileTypeIQ4_XS:
return "IQ4_XS"
case fileTypeIQ2_M:
return "IQ2_M"
case fileTypeIQ1_M:
return "IQ1_M"
case fileTypeBF16:
case FileTypeBF16:
return "BF16"
default:
return "unknown"
}
}
func (t fileType) Value() uint32 {
func (t FileType) Value() uint32 {
return uint32(t)
}
func (ftype FileType) ToTensorType() TensorType {
switch ftype {
case FileTypeF32:
return TensorTypeF32
case FileTypeF16:
return TensorTypeF16
case FileTypeQ4_0:
return TensorTypeQ4_0
case FileTypeQ4_1:
return TensorTypeQ4_1
case FileTypeQ8_0:
return TensorTypeQ8_0
case FileTypeQ5_0:
return TensorTypeQ5_0
case FileTypeQ5_1:
return TensorTypeQ5_1
case FileTypeQ2_K:
return TensorTypeQ2_K
case FileTypeQ3_K_S:
return TensorTypeQ3_K
case FileTypeQ3_K_M:
return TensorTypeQ3_K
case FileTypeQ3_K_L:
return TensorTypeQ3_K
case FileTypeQ4_K_S:
return TensorTypeQ4_K
case FileTypeQ4_K_M:
return TensorTypeQ4_K
case FileTypeQ5_K_S:
return TensorTypeQ5_K
case FileTypeQ5_K_M:
return TensorTypeQ5_K
case FileTypeQ6_K:
return TensorTypeQ6_K
case FileTypeQ2_K_S:
return TensorTypeQ2_K
case FileTypeBF16:
return TensorTypeBF16
default:
slog.Warn("unsupported file type", "type", ftype)
return 0 // F32
}
}
// TensorType is equivalent to ggml_type for individual tensor types
// Note: these are not the same as FileType
type TensorType uint32
const (
TensorTypeF32 TensorType = iota
TensorTypeF16
TensorTypeQ4_0
TensorTypeQ4_1
tensorTypeQ4_2 // unused by GGML
tensorTypeQ4_3 // unused by GGML
TensorTypeQ5_0
TensorTypeQ5_1
TensorTypeQ8_0
TensorTypeQ8_1
TensorTypeQ2_K
TensorTypeQ3_K
TensorTypeQ4_K
TensorTypeQ5_K
TensorTypeQ6_K
TensorTypeQ8_K
tensorTypeIQ2_XXS // not supported by ollama
tensorTypeIQ2_XS // not supported by ollama
tensorTypeIQ3_XXS // not supported by ollama
tensorTypeIQ1_S // not supported by ollama
tensorTypeIQ4_NL // not supported by ollama
tensorTypeIQ3_S // not supported by ollama
tensorTypeIQ2_S // not supported by ollama
tensorTypeIQ4_XS // not supported by ollama
TensorTypeI8
TensorTypeI16
TensorTypeI32
TensorTypeI64
TensorTypeF64
tensorTypeIQ1_M // not supported by ollama
TensorTypeBF16
tensorTypeQ4_0_4_4 // unused by GGML
tensorTypeQ4_0_4_8 // unused by GGML
tensorTypeQ4_0_8_8 // unused by GGML
tensorTypeTQ1_0 // not supported by ollama
tensorTypeTQ2_0 // not supported by ollama
tensorTypeIQ4_NL_4_4 // unused by GGML
tensorTypeIQ4_NL_4_8 // unused by GGML
tensorTypeIQ4_NL_8_8 // unused by GGML
)
// ParseFileType parses the provided GGUF file type
// Only Ollama supported types are considered valid
func ParseTensorType(s string) (TensorType, error) {
switch s {
case "F32":
return TensorTypeF32, nil
case "F16":
return TensorTypeF16, nil
case "Q4_0":
return TensorTypeQ4_0, nil
case "Q4_1":
return TensorTypeQ4_1, nil
case "Q5_0":
return TensorTypeQ5_0, nil
case "Q5_1":
return TensorTypeQ5_1, nil
case "Q8_0":
return TensorTypeQ8_0, nil
case "Q8_1":
return TensorTypeQ8_1, nil
case "Q2_K":
return TensorTypeQ2_K, nil
case "Q3_K":
return TensorTypeQ3_K, nil
case "Q4_K":
return TensorTypeQ4_K, nil
case "Q5_K":
return TensorTypeQ5_K, nil
case "Q6_K":
return TensorTypeQ6_K, nil
case "Q8_K":
return TensorTypeQ8_K, nil
case "F64":
return TensorTypeF64, nil
case "BF16":
return TensorTypeBF16, nil
default:
return 0, fmt.Errorf("unsupported quantization type %s", s)
}
}
func (t TensorType) IsQuantized() bool {
switch t {
case TensorTypeF32, TensorTypeF16, TensorTypeBF16:
return false
default:
return true
}
}
func (t TensorType) RowSize(ne uint64) uint64 {
return t.TypeSize() * ne / t.BlockSize()
}
func (t TensorType) String() string {
switch t {
case TensorTypeF32:
return "F32"
case TensorTypeF16:
return "F16"
case TensorTypeQ4_0:
return "Q4_0"
case TensorTypeQ4_1:
return "Q4_1"
case TensorTypeQ5_0:
return "Q5_0"
case TensorTypeQ5_1:
return "Q5_1"
case TensorTypeQ8_0:
return "Q8_0"
case TensorTypeQ8_1:
return "Q8_1"
case TensorTypeQ2_K:
return "Q2_K"
case TensorTypeQ3_K:
return "Q3_K"
case TensorTypeQ4_K:
return "Q4_K"
case TensorTypeQ5_K:
return "Q5_K"
case TensorTypeQ6_K:
return "Q6_K"
case TensorTypeQ8_K:
return "Q8_K"
case TensorTypeF64:
return "F64"
case TensorTypeBF16:
return "BF16"
default:
return "unknown"
}
}
......@@ -11,7 +11,7 @@ require (
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.9.0
github.com/x448/float16 v0.8.4
golang.org/x/sync v0.11.0
golang.org/x/sync v0.12.0
)
require (
......@@ -70,12 +70,12 @@ require (
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
golang.org/x/arch v0.8.0 // indirect
golang.org/x/crypto v0.33.0
golang.org/x/crypto v0.36.0
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
golang.org/x/net v0.35.0 // indirect
golang.org/x/sys v0.30.0
golang.org/x/term v0.29.0
golang.org/x/text v0.22.0
golang.org/x/net v0.38.0 // indirect
golang.org/x/sys v0.31.0
golang.org/x/term v0.30.0
golang.org/x/text v0.23.0
google.golang.org/protobuf v1.34.1
gopkg.in/yaml.v3 v3.0.1 // indirect
)
......@@ -214,8 +214,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus=
golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M=
golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
......@@ -257,8 +257,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
......@@ -268,8 +268,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
......@@ -285,17 +285,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik=
golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU=
golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s=
golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
......
......@@ -34,13 +34,15 @@ func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
func TestAllMiniLMEmbeddings(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbeddingRequest{
Model: "all-minilm",
Prompt: "why is the sky blue?",
}
res, err := embeddingTestHelper(ctx, t, req)
res, err := embeddingTestHelper(ctx, client, t, req)
if err != nil {
t.Fatalf("error: %v", err)
......@@ -62,13 +64,15 @@ func TestAllMiniLMEmbeddings(t *testing.T) {
func TestAllMiniLMEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
}
res, err := embedTestHelper(ctx, t, req)
res, err := embedTestHelper(ctx, client, t, req)
if err != nil {
t.Fatalf("error: %v", err)
......@@ -98,13 +102,15 @@ func TestAllMiniLMEmbed(t *testing.T) {
func TestAllMiniLMBatchEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbedRequest{
Model: "all-minilm",
Input: []string{"why is the sky blue?", "why is the grass green?"},
}
res, err := embedTestHelper(ctx, t, req)
res, err := embedTestHelper(ctx, client, t, req)
if err != nil {
t.Fatalf("error: %v", err)
......@@ -144,6 +150,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
func TestAllMiniLMEmbedTruncate(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
truncTrue, truncFalse := true, false
......@@ -182,7 +190,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
res := make(map[string]*api.EmbedResponse)
for _, req := range reqs {
response, err := embedTestHelper(ctx, t, req.Request)
response, err := embedTestHelper(ctx, client, t, req.Request)
if err != nil {
t.Fatalf("error: %v", err)
}
......@@ -198,7 +206,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
}
// check that truncate set to false returns an error if context length is exceeded
_, err := embedTestHelper(ctx, t, api.EmbedRequest{
_, err := embedTestHelper(ctx, client, t, api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Truncate: &truncFalse,
......@@ -210,9 +218,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
}
}
func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}
......@@ -226,9 +232,7 @@ func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingReq
return response, nil
}
func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}
......
......@@ -48,17 +48,6 @@ var (
}
)
func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
deadline, hasDeadline := t.Deadline()
if !hasDeadline {
return 8 * time.Minute, 10 * time.Minute
} else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 {
t.Skip("too little time")
return time.Duration(0), time.Duration(0)
}
return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second))
}
func TestModelsGenerate(t *testing.T) {
softTimeout, hardTimeout := getTimeouts(t)
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
......
//go:build integration && models
package integration
import (
"bytes"
"context"
"fmt"
"log/slog"
"strings"
"testing"
"time"
"github.com/ollama/ollama/api"
)
func TestQuantization(t *testing.T) {
sourceModels := []string{
"qwen2.5:0.5b-instruct-fp16",
}
quantizations := []string{
"Q8_0",
"Q4_K_S",
"Q4_K_M",
"Q4_K",
}
softTimeout, hardTimeout := getTimeouts(t)
started := time.Now()
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
for _, base := range sourceModels {
if err := PullIfMissing(ctx, client, base); err != nil {
t.Fatalf("pull failed %s", err)
}
for _, quant := range quantizations {
newName := fmt.Sprintf("%s__%s", base, quant)
t.Run(newName, func(t *testing.T) {
if time.Now().Sub(started) > softTimeout {
t.Skip("skipping remaining tests to avoid excessive runtime")
}
req := &api.CreateRequest{
Model: newName,
Quantization: quant,
From: base,
}
fn := func(resp api.ProgressResponse) error {
// fmt.Print(".")
return nil
}
t.Logf("quantizing: %s -> %s", base, quant)
if err := client.Create(ctx, req, fn); err != nil {
t.Fatalf("create failed %s", err)
}
defer func() {
req := &api.DeleteRequest{
Model: newName,
}
t.Logf("deleting: %s -> %s", base, quant)
if err := client.Delete(ctx, req); err != nil {
t.Logf("failed to clean up %s: %s", req.Model, err)
}
}()
// Check metadata on the model
resp, err := client.Show(ctx, &api.ShowRequest{Name: newName})
if err != nil {
t.Fatalf("unable to show model: %s", err)
}
if !strings.Contains(resp.Details.QuantizationLevel, quant) {
t.Fatalf("unexpected quantization for %s:\ngot: %s", newName, resp.Details.QuantizationLevel)
}
stream := true
genReq := api.GenerateRequest{
Model: newName,
Prompt: "why is the sky blue?",
KeepAlive: &api.Duration{Duration: 3 * time.Second},
Options: map[string]any{
"seed": 42,
"temperature": 0.0,
},
Stream: &stream,
}
t.Logf("verifying: %s -> %s", base, quant)
// Some smaller quantizations can cause models to have poor quality
// or get stuck in repetition loops, so we stop as soon as we have any matches
anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", "nitrogen", "oxygen"}
reqCtx, reqCancel := context.WithCancel(ctx)
atLeastOne := false
var buf bytes.Buffer
genfn := func(response api.GenerateResponse) error {
buf.Write([]byte(response.Response))
fullResp := strings.ToLower(buf.String())
for _, resp := range anyResp {
if strings.Contains(fullResp, resp) {
atLeastOne = true
t.Log(fullResp)
reqCancel()
break
}
}
return nil
}
done := make(chan int)
var genErr error
go func() {
genErr = client.Generate(reqCtx, &genReq, genfn)
done <- 0
}()
select {
case <-done:
if genErr != nil && !atLeastOne {
t.Fatalf("failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
}
case <-ctx.Done():
t.Error("outer test context done while waiting for generate")
}
t.Logf("passed")
})
}
}
}
......@@ -217,6 +217,7 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err)
return
}
defer fp.Close()
data, err := io.ReadAll(fp)
if err != nil {
slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err)
......@@ -358,3 +359,14 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) {
}
}
}
func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
deadline, hasDeadline := t.Deadline()
if !hasDeadline {
return 8 * time.Minute, 10 * time.Minute
} else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 {
t.Skip("too little time")
return time.Duration(0), time.Duration(0)
}
return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second))
}
......@@ -239,7 +239,7 @@ func (c *Causal) findStartLoc() (int, error) {
}
}
return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, len(c.cells))
return 0, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
}
func (c *Causal) updateSlidingWindow() {
......
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "2016f07bd106c73699ecbaace80f55db5ed95dac";
char const *LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";
......@@ -342,6 +342,8 @@ struct common_params {
// multimodal models (see examples/llava)
struct common_params_model mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
// embedding
......
......@@ -16,6 +16,9 @@ using json = nlohmann::ordered_json;
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max();
if (max_items == 0) {
return "";
}
if (min_items == 0 && max_items == 1) {
return item_rule + "?";
}
......
......@@ -2,8 +2,6 @@
#include "gguf.h"
#include "clip.h"
#include "clip.h"
#include <climits>
#include <cstdarg>
#include <string>
......@@ -17,33 +15,31 @@
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu"
#define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count"
#define KEY_N_HEAD "clip.%s.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.%s.projection_dim"
#define KEY_TOKENS "tokenizer.ggml.tokens"
#define KEY_N_POSITIONS "clip.text.context_length"
#define KEY_N_EMBD "clip.vision.embedding_length"
#define KEY_N_FF "clip.vision.feed_forward_length"
#define KEY_N_BLOCK "clip.vision.block_count"
#define KEY_N_HEAD "clip.vision.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.vision.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.vision.projection_dim"
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl
#define KEY_USE_RMS_NORM "clip.use_rms_norm" // for qwen2.5vl
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
//
......@@ -60,7 +56,9 @@
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
#define TN_LN_1 "%s.blk.%d.ln1.%s"
#define TN_LN_2 "%s.blk.%d.ln2.%s"
#define TN_LN_PRE "%s.pre_ln.%s"
......@@ -72,6 +70,8 @@
#define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
// mimicpmv
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
......@@ -87,18 +87,19 @@
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
#define TN_GLM_BOI_W "adapter.boi"
#define TN_GLM_EOI_W "adapter.eoi"
enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_RESAMPLER,
PROJECTOR_TYPE_MINICPMV,
PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_MERGER,
PROJECTOR_TYPE_QWEN2VL,
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL,
PROJECTOR_TYPE_QWEN25VL,
PROJECTOR_TYPE_UNKNOWN,
};
......@@ -106,10 +107,13 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" },
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
{ PROJECTOR_TYPE_MINICPMV, "resampler"},
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
{ PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
......
......@@ -28,6 +28,7 @@
#include <cinttypes>
#include <limits>
#include <array>
#include <numeric>
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
......@@ -172,14 +173,18 @@ struct clip_hparams {
int32_t projection_dim;
int32_t n_head;
int32_t n_layer;
int32_t proj_scale_factor = 0; // idefics3
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
float eps;
float eps = 1e-6;
float rope_theta = 0.0;
std::vector<int32_t> image_grid_pinpoints;
int32_t image_crop_resolution;
std::unordered_set<int32_t> vision_feature_layer;
int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0;
};
struct clip_layer {
......@@ -199,11 +204,20 @@ struct clip_layer {
struct ggml_tensor * ln_1_b = nullptr;
// ff
struct ggml_tensor * ff_i_w = nullptr;
struct ggml_tensor * ff_i_b = nullptr;
struct ggml_tensor * ff_i_w = nullptr; // legacy naming
struct ggml_tensor * ff_i_b = nullptr; // legacy naming
struct ggml_tensor * ff_o_w = nullptr; // legacy naming
struct ggml_tensor * ff_o_b = nullptr; // legacy naming
struct ggml_tensor * ff_o_w = nullptr;
struct ggml_tensor * ff_o_b = nullptr;
struct ggml_tensor * ff_up_w = nullptr;
struct ggml_tensor * ff_up_b = nullptr;
struct ggml_tensor * ff_gate_w = nullptr;
struct ggml_tensor * ff_gate_b = nullptr;
struct ggml_tensor * ff_down_w = nullptr;
struct ggml_tensor * ff_down_b = nullptr;
struct ggml_tensor * ff_g_w = NULL;
struct ggml_tensor * ff_g_b = NULL;
// layernorm 2
struct ggml_tensor * ln_2_w = nullptr;
......@@ -249,8 +263,6 @@ struct clip_vision_model {
//GLMV-Edge projection
struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
struct ggml_tensor * boi_w = nullptr;
struct ggml_tensor * eoi_w = nullptr;
// MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w = nullptr;
......@@ -309,16 +321,14 @@ struct clip_vision_model {
// gemma3
struct ggml_tensor * mm_input_proj_w = nullptr;
struct ggml_tensor * mm_soft_emb_norm_w = nullptr;
// pixtral
struct ggml_tensor * token_embd_img_break = nullptr;
};
struct clip_ctx {
bool has_text_encoder = false;
bool has_vision_encoder = false;
bool has_llava_projector = false;
bool has_minicpmv_projector = false;
bool has_glm_projector = false;
bool has_qwen2vl_merger = false;
int minicpmv_version = 2;
int minicpmv_version = 0;
struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP;
......@@ -341,6 +351,7 @@ struct clip_ctx {
ggml_backend_t backend_cpu;
ggml_backend_buffer_ptr buf;
int max_nodes = 8192;
ggml_backend_sched_ptr sched;
clip_image_size load_image_size;
......@@ -376,23 +387,20 @@ struct clip_ctx {
}
};
static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) {
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
const int image_size = hparams.image_size;
int image_size_width = image_size;
int image_size_height = image_size;
int image_size_width = img.nx;
int image_size_height = img.ny;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(),
......@@ -519,6 +527,35 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
embeddings = ggml_mul_mat(ctx0,
ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
embeddings);
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
ggml_tensor * cur = embeddings;
const int scale_factor = model.hparams.proj_scale_factor;
const int n_embd = cur->ne[0];
const int seq = cur->ne[1];
const int bsz = 1; // batch size, always 1 for now since we don't support batching
const int height = std::sqrt(seq);
const int width = std::sqrt(seq);
GGML_ASSERT(scale_factor != 0);
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
n_embd * scale_factor * scale_factor,
height / scale_factor,
width / scale_factor,
bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
n_embd * scale_factor * scale_factor,
seq / (scale_factor * scale_factor),
bsz);
cur = ggml_mul_mat(ctx0, model.projection, cur);
embeddings = cur;
} else {
GGML_ABORT("SigLIP: Unsupported projector type");
}
// build the graph
......@@ -527,19 +564,462 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
return gf;
}
static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
if (!ctx->has_vision_encoder) {
LOG_ERR("This gguf file seems to have no vision encoder\n");
return nullptr;
// implementation of the 2D RoPE without adding a new op in ggml
// this is not efficient (use double the memory), but works on all backends
// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
static ggml_tensor * build_rope_2d(
ggml_context * ctx0,
ggml_tensor * cur,
ggml_tensor * pos_h,
ggml_tensor * pos_w,
const float freq_base
) {
const int64_t n_dim = cur->ne[0];
const int64_t n_head = cur->ne[1];
const int64_t n_pos = cur->ne[2];
// for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
// we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
// first half of cur will use 1e-0, 1e-2 (even)
// second half of cur will use 1e-1, 1e-3 (odd)
// the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
// ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
// then for the second half, we use freq_scale to shift the inv_freq
// ^ why? replace (2i) with (2i+1) in the above equation
const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
// first half
ggml_tensor * first;
{
first = ggml_view_3d(ctx0, cur,
n_dim/2, n_head, n_pos,
ggml_row_size(cur->type, n_dim),
ggml_row_size(cur->type, n_dim*n_head),
0);
first = ggml_rope_ext(
ctx0,
first,
pos_h, // positions
nullptr, // freq factors
n_dim/2, // n_dims
0, 0, freq_base,
1.0f, 0.0f, 1.0f, 0.0f, 0.0f
);
}
// second half
ggml_tensor * second;
{
second = ggml_view_3d(ctx0, cur,
n_dim/2, n_head, n_pos,
ggml_row_size(cur->type, n_dim),
ggml_row_size(cur->type, n_dim*n_head),
n_dim/2 * ggml_element_size(cur));
second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
second = ggml_rope_ext(
ctx0,
second,
pos_w, // positions
nullptr, // freq factors
n_dim/2, // n_dims
0, 0, freq_base,
freq_scale_odd,
0.0f, 1.0f, 0.0f, 0.0f
);
}
cur = ggml_concat(ctx0, first, second, 0);
return cur;
}
static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) {
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
GGML_ASSERT(ctx->proj_type == PROJECTOR_TYPE_PIXTRAL);
int image_size_width = img.nx;
int image_size_height = img.ny;
const int patch_size = hparams.patch_size;
const int n_patches_x = image_size_width / patch_size;
const int n_patches_y = image_size_height / patch_size;
const int num_patches = n_patches_x * n_patches_y;
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
ggml_context_ptr ctx0_ptr(ggml_init(params));
auto ctx0 = ctx0_ptr.get();
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
// input raw
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
// 2D input positions
struct ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
struct ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
struct ggml_tensor * embeddings = inp;
// pre-layer norm
embeddings = ggml_mul(ctx0, ggml_rms_norm(ctx0, embeddings, eps), model.pre_ln_w);
// loop over layers
for (int il = 0; il < n_layer; il++) {
struct ggml_tensor * cur = embeddings;
// pre-attention norm
cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_1_w);
// self-attention
{
struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur);
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
Q = build_rope_2d(ctx0, Q, pos_h, pos_w, hparams.rope_theta);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur);
K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur);
V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
cur = ggml_mul_mat(ctx0, model.layers[il].o_w, cur);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, embeddings);
embeddings = cur; // embeddings = residual, cur = hidden_states
// pre-ffn norm
cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_2_w);
// feed-forward
{
ggml_tensor * gate_proj = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur);
ggml_tensor * up_proj = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur);
gate_proj = ggml_silu(ctx0, gate_proj); // pixtral uses silu
cur = ggml_mul(ctx0, up_proj, gate_proj);
cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur);
}
// residual 2
cur = ggml_add(ctx0, embeddings, cur);
embeddings = cur;
}
// LlavaMultiModalProjector (with GELU activation)
{
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
}
// arrangement of the [IMG_BREAK] token
{
// not efficient, but works
// the trick is to view the embeddings as a 3D tensor with shape [hidden_size, n_patches_per_row, n_rows]
// and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
// after the concatenation, we have a tensor with shape [hidden_size, n_patches_per_row + 1, n_rows]
const int n_embd_text = embeddings->ne[0];
const int n_tokens_output = num_patches + n_patches_y - 1; // one [IMG_BREAK] per row, except the last row
ggml_tensor * cur = ggml_reshape_3d(ctx0, embeddings, n_embd_text, n_patches_x, n_patches_y);
ggml_tensor * tok = ggml_new_tensor_3d(ctx0, embeddings->type, n_embd_text, 1, n_patches_y);
tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor
tok = ggml_add(ctx0, tok, model.token_embd_img_break);
cur = ggml_concat(ctx0, cur, tok, 1);
embeddings = ggml_view_2d(ctx0, cur,
n_embd_text, n_tokens_output,
ggml_row_size(cur->type, n_embd_text), 0);
}
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}
static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
const int image_size_width = imgs.entries[0]->nx;
const int image_size_height = imgs.entries[0]->ny;
const bool use_window_attn = hparams.n_wa_pattern > 0;
const int n_wa_pattern = hparams.n_wa_pattern;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int patches_w = image_size_width / patch_size;
const int patches_h = image_size_height / patch_size;
const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
const int batch_size = imgs.entries.size();
GGML_ASSERT(batch_size == 1);
struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
ggml_context_ptr ctx0_ptr(ggml_init(params));
auto ctx0 = ctx0_ptr.get();
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, patches_h, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
inp = ggml_reshape_3d(
ctx0, inp,
hidden_size, patches_w * patches_h, batch_size);
if (model.patch_bias) {
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
inp = ggml_add(ctx0, inp, model.patch_bias);
}
struct ggml_tensor * embeddings = inp;
struct ggml_tensor * window_mask = nullptr;
struct ggml_tensor * window_idx = nullptr;
struct ggml_tensor * inv_window_idx = nullptr;
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// pre-layernorm
if (model.pre_ln_w) {
embeddings = ggml_rms_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "pre_ln");
embeddings = ggml_mul(ctx0, embeddings, model.pre_ln_w);
}
if (use_window_attn) {
// handle window attention inputs
inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
ggml_set_name(inv_window_idx, "inv_window_idx");
ggml_set_input(inv_window_idx);
// mask for window attention
window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, num_positions, num_positions);
ggml_set_name(window_mask, "window_mask");
ggml_set_input(window_mask);
// embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
GGML_ASSERT(batch_size == 1);
embeddings = ggml_reshape_2d(ctx0, embeddings, hidden_size * 4, patches_w * patches_h * batch_size / 4);
embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx);
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size);
}
// loop over layers
for (int il = 0; il < n_layer; il++) {
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
// rmsnorm1
cur = ggml_rms_norm(ctx0, cur, eps);
cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w);
// self-attention
{
struct ggml_tensor * Q =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
Q = ggml_rope_multi(
ctx0, Q, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
struct ggml_tensor * K =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
K = ggml_rope_multi(
ctx0, K, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
struct ggml_tensor * V =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
if (full_attn) {
KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
} else {
KQ = ggml_soft_max_ext(ctx0, KQ, window_mask, 1.0f / sqrtf((float)d_head), 0.0f);
}
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
}
// attention output
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, embeddings);
embeddings = cur; // embeddings = residual, cur = hidden_states
// rms norm2
cur = ggml_rms_norm(ctx0, cur, eps);
cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w);
// mlp
// ffn_up
auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_o_b);
auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_g_w, cur);
cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_g_b);
// TODO : only 2 of these 3 are actually used, should we remove one of them?
if (ctx->use_gelu) {
cur_gate = ggml_gelu_inplace(ctx0, cur_gate);
} else if (ctx->use_silu) {
cur_gate = ggml_silu_inplace(ctx0, cur_gate);
} else {
cur_gate = ggml_gelu_quick_inplace(ctx0, cur_gate);
}
cur = ggml_mul(ctx0, cur_gate, cur_up);
// ffn_down
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
// residual 2
cur = ggml_add(ctx0, embeddings, cur);
embeddings = cur;
}
// post-layernorm
if (model.post_ln_w) {
embeddings = ggml_rms_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "post_ln");
embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w);
}
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// GELU activation
embeddings = ggml_gelu(ctx0, embeddings);
// Second linear layer
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
if (use_window_attn) {
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4);
ggml_set_name(window_idx, "window_idx");
ggml_set_input(window_idx);
// embeddings shape: [hidden_size, patches_w * patches_h, batch_size]
GGML_ASSERT(batch_size == 1);
embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4);
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4, batch_size);
}
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}
static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
const int image_size = hparams.image_size;
int image_size_width = image_size;
int image_size_height = image_size;
if (ctx->has_minicpmv_projector) {
if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height);
image_size_width = load_image_size.width;
image_size_height = load_image_size.height;
......@@ -548,7 +1028,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
image_size_height = imgs.entries[0]->ny;
}
}
else if (ctx->has_qwen2vl_merger) {
else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
// use the image's native resolution when image is avaible
if (is_inf) {
// if (imgs->data->nx && imgs->data->ny) {
......@@ -556,12 +1037,13 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
image_size_height = imgs.entries[0]->ny;
}
}
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int patches_w = image_size_width / patch_size;
const int patches_h = image_size_height / patch_size;
const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
const int num_position_ids = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ? num_positions * 4 : num_positions;
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
......@@ -570,7 +1052,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
const int batch_size = imgs.entries.size();
if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) {
if (ctx->has_llava_projector
|| ctx->proj_type == PROJECTOR_TYPE_MINICPMV
|| ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
GGML_ASSERT(batch_size == 1);
}
......@@ -591,8 +1075,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
if (ctx->has_qwen2vl_merger) {
GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
......@@ -621,40 +1105,30 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
struct ggml_tensor * embeddings = inp;
struct ggml_tensor * pos_embed = nullptr;
if (ctx->has_llava_projector) {
// concat class_embeddings and patch_embeddings
if (model.class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
// concat class_embeddings and patch_embeddings
if (model.class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
embeddings = ggml_scale(ctx0, embeddings, 0.0f); // set to all zeros
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding
if (ctx->proj_type != PROJECTOR_TYPE_QWEN2VL) { // qwen2vl does NOT use learned position embeddings
embeddings =
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
}
if (ctx->has_minicpmv_projector) {
if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
int pos_w = image_size_width/patch_size;
int pos_h = image_size_height/patch_size;
if (ctx->minicpmv_version == 2) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
}
else if (ctx->minicpmv_version == 3) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
else if (ctx->minicpmv_version == 4) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
int n_output_dim = clip_n_mmproj_embd(ctx);
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1);
ggml_set_name(pos_embed, "pos_embed");
ggml_set_input(pos_embed);
}
......@@ -697,7 +1171,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
if (ctx->has_qwen2vl_merger) {
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
Q = ggml_rope_multi(
ctx0, Q, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
......@@ -709,7 +1183,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
if (ctx->has_qwen2vl_merger) {
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
K = ggml_rope_multi(
ctx0, K, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
......@@ -974,106 +1448,92 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
}
}
// minicpmv projector
else if (ctx->has_minicpmv_projector)
{
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
struct ggml_tensor * q = model.mm_model_query;
{ // layernorm
q = ggml_norm(ctx0, q, eps);
q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
struct ggml_tensor * q = model.mm_model_query;
{ // layernorm
q = ggml_norm(ctx0, q, eps);
q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
}
struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
{ // layernorm
v = ggml_norm(ctx0, v, eps);
v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
}
struct ggml_tensor * k;
{ // position
// q = ggml_add(ctx0, q, model.mm_model_pos_embed);
k = ggml_add(ctx0, v, pos_embed);
}
{ // attention
int hidden_size = clip_n_mmproj_embd(ctx);
const int d_head = 128;
int n_head = hidden_size/d_head;
int num_query = 96;
if (ctx->minicpmv_version == 2) {
num_query = 96;
}
struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
{ // layernorm
v = ggml_norm(ctx0, v, eps);
v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
else if (ctx->minicpmv_version == 3) {
num_query = 64;
}
struct ggml_tensor * k;
{ // position
// q = ggml_add(ctx0, q, model.mm_model_pos_embed);
k = ggml_add(ctx0, v, pos_embed);
else if (ctx->minicpmv_version == 4) {
num_query = 64;
}
{ // attention
int hidden_size = 4096;
const int d_head = 128;
int n_head = hidden_size/d_head;
int num_query = 96;
if (ctx->minicpmv_version == 2) {
hidden_size = 4096;
n_head = hidden_size/d_head;
num_query = 96;
}
else if (ctx->minicpmv_version == 3) {
hidden_size = 3584;
n_head = hidden_size/d_head;
num_query = 64;
}
else if (ctx->minicpmv_version == 4) {
hidden_size = 3584;
n_head = hidden_size/d_head;
num_query = 64;
}
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
// permute
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
// permute
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
}
{ // layernorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
}
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
}
else {
GGML_ASSERT(false);
{ // layernorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
}
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
}
// glm projector
else if (ctx->has_glm_projector) {
if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
//GLU
{
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
embeddings = ggml_gelu_inplace(ctx0, embeddings);
struct ggml_tensor * x = embeddings;
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
embeddings = ggml_silu_inplace(ctx0, embeddings);
embeddings = ggml_mul(ctx0, embeddings,x);
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
}
} else {
GGML_ABORT("fatal error");
else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
// GLU
{
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
embeddings = ggml_gelu_inplace(ctx0, embeddings);
struct ggml_tensor * x = embeddings;
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
embeddings = ggml_silu_inplace(ctx0, embeddings);
embeddings = ggml_mul(ctx0, embeddings,x);
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
}
}
else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
......@@ -1094,12 +1554,30 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
}
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
return clip_image_build_graph_siglip(ctx, imgs);
} else {
// TODO: we should have one build_* function per model
return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
ggml_cgraph * res;
switch (ctx->proj_type) {
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_IDEFICS3:
{
GGML_ASSERT(imgs.entries.size() == 1);
res = clip_image_build_graph_siglip(ctx, *imgs.entries[0]);
} break;
case PROJECTOR_TYPE_PIXTRAL:
{
GGML_ASSERT(imgs.entries.size() == 1);
res = clip_image_build_graph_pixtral(ctx, *imgs.entries[0]);
} break;
case PROJECTOR_TYPE_QWEN25VL:
{
res = clip_image_build_graph_qwen25vl(ctx, imgs);
} break;
default:
{
// TODO: we should have one build_* function per model
res = clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
} break;
}
return res;
}
struct clip_model_loader {
......@@ -1109,7 +1587,7 @@ struct clip_model_loader {
clip_ctx & ctx_clip;
std::string fname;
size_t model_size; // in bytes
size_t model_size = 0; // in bytes
// TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model
clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) {
......@@ -1160,9 +1638,11 @@ struct clip_model_loader {
}
void load_hparams() {
auto & hparams = ctx_clip.vision_model.hparams;
// projector type
std::string proj_type;
{
std::string proj_type;
get_string(KEY_PROJ_TYPE, proj_type, false);
if (!proj_type.empty()) {
ctx_clip.proj_type = clip_projector_type_from_string(proj_type);
......@@ -1174,34 +1654,27 @@ struct clip_model_loader {
// other hparams
{
get_bool(KEY_HAS_TEXT_ENC, ctx_clip.has_text_encoder, false);
get_bool(KEY_HAS_VIS_ENC, ctx_clip.has_vision_encoder, false);
GGML_ASSERT(ctx_clip.has_vision_encoder);
GGML_ASSERT(!ctx_clip.has_text_encoder);
// legacy keys, use KEY_PROJ_TYPE instead
get_bool(KEY_HAS_LLAVA_PROJ, ctx_clip.has_llava_projector, false);
get_bool(KEY_HAS_MINICPMV_PROJ, ctx_clip.has_minicpmv_projector, false);
get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false);
get_bool(KEY_HAS_GLM_PROJ, ctx_clip.has_glm_projector, false);
get_bool(KEY_HAS_QWEN2VL_MERGER, ctx_clip.has_qwen2vl_merger, false);
// !!! do NOT extend the list above, use KEY_PROJ_TYPE instead
get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false);
get_bool(KEY_USE_SILU, ctx_clip.use_silu, false);
auto & hparams = ctx_clip.vision_model.hparams;
get_u32(string_format(KEY_N_EMBD, "vision"), hparams.hidden_size);
get_u32(string_format(KEY_N_HEAD, "vision"), hparams.n_head);
get_u32(string_format(KEY_N_FF, "vision"), hparams.n_intermediate);
get_u32(string_format(KEY_N_BLOCK, "vision"), hparams.n_layer);
get_u32(string_format(KEY_PROJ_DIM, "vision"), hparams.projection_dim);
get_f32(string_format(KEY_LAYER_NORM_EPS, "vision"), hparams.eps);
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_u32(KEY_N_EMBD, hparams.hidden_size);
get_u32(KEY_N_HEAD, hparams.n_head);
get_u32(KEY_N_FF, hparams.n_intermediate);
get_u32(KEY_N_BLOCK, hparams.n_layer);
get_u32(KEY_PROJ_DIM, hparams.projection_dim);
get_f32(KEY_LAYER_NORM_EPS, hparams.eps);
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP
|| ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM
|| ctx_clip.proj_type == PROJECTOR_TYPE_LDP
|| ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2;
{
std::string mm_patch_merge_type;
get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false);
......@@ -1234,15 +1707,62 @@ struct clip_model_loader {
for (auto & layer : vision_feature_layer) {
hparams.vision_feature_layer.insert(layer);
}
// Calculate the deepest feature layer based on hparams and projector type
ctx_clip.max_feature_layer = get_deepest_feature_layer(&ctx_clip);
// NOTE: This is only used by build_graph_legacy()
{
// Get the index of the second to last layer; this is the default for models that have a llava projector
int n_layer = hparams.n_layer - 1;
int deepest_feature_layer = -1;
if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV
|| ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE
|| ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL
|| ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) {
n_layer += 1;
}
// If we set explicit vision feature layers, only go up to the deepest one
// NOTE: only used by granite-vision models for now
for (const auto & feature_layer : hparams.vision_feature_layer) {
if (feature_layer > deepest_feature_layer) {
deepest_feature_layer = feature_layer;
}
}
ctx_clip.max_feature_layer = deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
}
// model-specific params
switch (ctx_clip.proj_type) {
case PROJECTOR_TYPE_MINICPMV:
{
if (ctx_clip.minicpmv_version == 0) {
ctx_clip.minicpmv_version = 2; // default to 2 if not set
}
} break;
case PROJECTOR_TYPE_IDEFICS3:
{
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
} break;
case PROJECTOR_TYPE_PIXTRAL:
{
hparams.rope_theta = 10000.0f;
} break;
case PROJECTOR_TYPE_QWEN25VL:
{
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
} break;
default:
break;
}
LOG_INF("%s: text_encoder: %d\n", __func__, ctx_clip.has_text_encoder);
LOG_INF("%s: vision_encoder: %d\n", __func__, ctx_clip.has_vision_encoder);
LOG_INF("%s: llava_projector: %d\n", __func__, ctx_clip.has_llava_projector);
LOG_INF("%s: minicpmv_projector: %d\n", __func__, ctx_clip.has_minicpmv_projector);
LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
LOG_INF("%s: glm_projector: %d\n", __func__, ctx_clip.has_glm_projector);
LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
LOG_INF("%s: use_silu: %d\n", __func__, ctx_clip.use_silu);
LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu);
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
}
......@@ -1298,9 +1818,6 @@ struct clip_model_loader {
vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
if (vision_model.patch_embeddings_1 == nullptr) {
ctx_clip.has_qwen2vl_merger = false;
}
vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false);
......@@ -1314,16 +1831,28 @@ struct clip_model_loader {
layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
layer.ff_i_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
layer.ff_o_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight"));
layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);
layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false);
layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false);
layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false);
layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false);
layer.ff_i_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false);
layer.ff_o_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false);
// new naming
layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight"));
layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false);
layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false);
layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, "v", il, "bias"), false);
layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false);
// legacy naming (the in and out is reversed! don't ask me why)
layer.ff_i_w = layer.ff_down_w;
layer.ff_o_w = layer.ff_up_w;
layer.ff_g_w = layer.ff_gate_w;
layer.ff_i_b = layer.ff_down_b;
layer.ff_o_b = layer.ff_up_b;
layer.ff_g_b = layer.ff_gate_b;
}
switch (ctx_clip.proj_type) {
......@@ -1388,7 +1917,7 @@ struct clip_model_loader {
vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
} break;
case PROJECTOR_TYPE_RESAMPLER:
case PROJECTOR_TYPE_MINICPMV:
{
// vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
......@@ -1420,10 +1949,9 @@ struct clip_model_loader {
vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
vision_model.boi_w = get_tensor(TN_GLM_BOI_W);
vision_model.eoi_w = get_tensor(TN_GLM_EOI_W);
} break;
case PROJECTOR_TYPE_MERGER:
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
{
vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
......@@ -1435,6 +1963,19 @@ struct clip_model_loader {
vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break;
case PROJECTOR_TYPE_IDEFICS3:
{
vision_model.projection = get_tensor(TN_MM_PROJECTOR);
} break;
case PROJECTOR_TYPE_PIXTRAL:
{
vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
// [IMG_BREAK] token embedding
vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
} break;
default:
GGML_ASSERT(false && "unknown projector type");
}
......@@ -1503,18 +2044,17 @@ struct clip_model_loader {
}
void alloc_compute_meta() {
ctx_clip.buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
// create a fake batch
clip_image_f32_batch batch;
clip_image_f32_ptr img(clip_image_f32_init());
clip_image_size image_size;
image_size.width = clip_get_image_size(&ctx_clip);
image_size.height = clip_get_image_size(&ctx_clip);
int n_patches = clip_get_image_size(&ctx_clip) / image_size.width;
img->nx = n_patches;
img->ny = n_patches;
img->buf.resize(n_patches * image_size.width * image_size.height * 3);
image_size.width = ctx_clip.vision_model.hparams.image_size;
image_size.height = ctx_clip.vision_model.hparams.image_size;
img->nx = image_size.width;
img->ny = image_size.height;
img->buf.resize(image_size.width * image_size.height * 3);
batch.entries.push_back(std::move(img));
ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false);
......@@ -1902,6 +2442,26 @@ struct image_manipulation {
}
}
// calculate the size of the **resized** image, while preserving the aspect ratio
// the calculated size will be aligned to the nearest multiple of align_size
// if H or W size is larger than max_dimension, it will be resized to max_dimension
static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) {
if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) {
return {0, 0};
}
float scale = std::min(1.0f, std::min(static_cast<float>(max_dimension) / inp_size.width,
static_cast<float>(max_dimension) / inp_size.height));
float target_width_f = static_cast<float>(inp_size.width) * scale;
float target_height_f = static_cast<float>(inp_size.height) * scale;
int aligned_width = GGML_PAD((int)target_width_f, align_size);
int aligned_height = GGML_PAD((int)target_height_f, align_size);
return {aligned_width, aligned_height};
}
private:
static inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
......@@ -2194,11 +2754,6 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
if (!ctx->has_vision_encoder) {
LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
return false;
}
clip_image_size original_size{img->nx, img->ny};
bool pad_to_square = true;
auto & params = ctx->vision_model.hparams;
......@@ -2219,7 +2774,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
}
return true;
}
else if (ctx->has_qwen2vl_merger) {
else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
clip_image_u8 resized;
auto patch_size = clip_get_patch_size(ctx) * 2;
int nx = ceil((float)img->nx / patch_size) * patch_size;
......@@ -2233,17 +2788,27 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->entries.push_back(std::move(img_f32));
return true;
}
if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE
|| ctx->proj_type == PROJECTOR_TYPE_GEMMA3
|| ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
clip_image_u8 resized_image;
int sz = params.image_size;
image_manipulation::bicubic_resize(*img, resized_image, sz, sz);
image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz});
clip_image_f32_ptr img_f32(clip_image_f32_init());
//clip_image_save_to_bmp(resized_image, "resized.bmp");
normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
res_imgs->entries.push_back(std::move(img_f32));
return true;
}
else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
clip_image_u8 resized_image;
auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
res_imgs->entries.push_back(std::move(img_f32));
return true;
}
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
......@@ -2299,16 +2864,18 @@ void clip_free(clip_ctx * ctx) {
delete ctx;
}
// deprecated
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
int extra_tokens = ctx->has_glm_projector ? 2 : 0;
return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
const int32_t nx = ctx->vision_model.hparams.image_size;
const int32_t ny = ctx->vision_model.hparams.image_size;
return clip_embd_nbytes_by_img(ctx, nx, ny);
}
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
clip_image_f32 img;
img.nx = img_w;
img.ny = img_h;
return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
int32_t clip_get_image_size(const struct clip_ctx * ctx) {
......@@ -2338,21 +2905,44 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.image_grid_pinpoints.size();
}
// deprecated
int clip_n_patches(const struct clip_ctx * ctx) {
clip_image_f32 img;
img.nx = ctx->vision_model.hparams.image_size;
img.ny = ctx->vision_model.hparams.image_size;
return clip_n_patches_by_img(ctx, &img);
return clip_n_output_tokens(ctx, &img);
}
// deprecated
int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
return clip_n_output_tokens(ctx, img);
}
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->vision_model.hparams;
const int n_total = clip_n_output_tokens(ctx, img);
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
}
return n_total;
}
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->vision_model.hparams;
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
}
return 1;
}
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->vision_model.hparams;
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
n_patches /= 4;
} else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
} else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
if (ctx->minicpmv_version == 2) {
n_patches = 96;
}
......@@ -2362,13 +2952,22 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
else if (ctx->minicpmv_version == 4) {
n_patches = 64;
}
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
else {
GGML_ABORT("Unknown minicpmv version");
}
} else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
int patch_size = params.patch_size * 2;
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
n_patches = x_patch * y_patch;
} else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
n_patches = 256;
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
n_patches /= ctx->vision_model.hparams.proj_scale_factor;
} else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
int n_patches_x = img->nx / params.patch_size;
int n_patches_y = img->ny / params.patch_size;
n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
}
return n_patches;
......@@ -2461,11 +3060,6 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
}
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) {
LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
return false;
}
clip_image_f32_batch imgs;
clip_image_f32_ptr img_copy(clip_image_f32_init());
*img_copy = *img;
......@@ -2476,24 +3070,12 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
const clip_image_f32_batch & imgs = *imgs_c_ptr;
if (!ctx->has_vision_encoder) {
LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
return false;
}
int batch_size = imgs.entries.size();
if (ctx->has_llava_projector) {
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
}
if (ctx->has_minicpmv_projector) {
GGML_ASSERT(batch_size == 1);
}
if (ctx->has_glm_projector) {
if (ctx->has_llava_projector
|| ctx->proj_type == PROJECTOR_TYPE_MINICPMV
|| ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
GGML_ASSERT(batch_size == 1);
ggml_tensor * boi = ctx->vision_model.boi_w;
ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
}
// build the inference graph
......@@ -2502,164 +3084,283 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
// set inputs
const auto & model = ctx->vision_model;
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
const int image_size = hparams.image_size;
int image_size_width = image_size;
int image_size_height = image_size;
if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
image_size_width = imgs.entries[0]->nx;
image_size_height = imgs.entries[0]->ny;
}
const int image_size_width = imgs.entries[0]->nx;
const int image_size_height = imgs.entries[0]->ny;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
const int pos_w = ctx->load_image_size.width / patch_size;
const int pos_w = ctx->load_image_size.width / patch_size;
const int pos_h = ctx->load_image_size.height / patch_size;
const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
auto get_inp_tensor = [&gf](const char * name) {
struct ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
if (inp == nullptr) {
GGML_ABORT("Failed to get tensor %s", name);
}
if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
GGML_ABORT("Tensor %s is not an input tensor", name);
}
return inp;
};
auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
ggml_tensor * cur = get_inp_tensor(name);
GGML_ASSERT(cur->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
};
auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
ggml_tensor * cur = get_inp_tensor(name);
GGML_ASSERT(cur->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
};
// set input pixel values
{
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
float * data = (float *)malloc(ggml_nbytes(inp_raw));
size_t nelem = 0;
for (const auto & img : imgs.entries) {
nelem += img->nx * img->ny * 3;
}
std::vector<float> inp_raw(nelem);
// layout of data (note: the channel dim is unrolled to better visualize the layout):
//
// ┌──W──┐
// │ H │ channel = R
// ├─────┤ │
// │ H │ channel = G
// ├─────┤ │
// │ H │ channel = B
// └─────┘ │
// ──────┘ x B
for (size_t i = 0; i < imgs.entries.size(); i++) {
const int nx = imgs.entries[i]->nx;
const int ny = imgs.entries[i]->ny;
if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
GGML_ASSERT(nx == image_size && ny == image_size);
}
const int n = nx * ny;
for (int b = 0; b < batch_size; b++) {
for (int k = 0; k < 3; k++) {
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k];
}
float * batch_entry = inp_raw.data() + b * (3*n);
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
size_t base_src = 3*(y * nx + x); // idx of the first channel
size_t base_dst = y * nx + x; // idx of the first channel
batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
}
}
}
}
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
free(data);
set_input_f32("inp_raw", inp_raw);
}
if (ctx->has_minicpmv_projector) {
{
// inspired from siglip:
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions));
int bucket_coords_h[1024];
int bucket_coords_w[1024];
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
}
for (int i = 0; i < pos_w; i++){
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
}
for (int i = 0, id = 0; i < pos_h; i++){
for (int j = 0; j < pos_w; j++){
positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
}
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
{
// inspired from resampler of Qwen-VL:
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
int embed_dim = 4096;
if (ctx->minicpmv_version == 2) {
embed_dim = 4096;
}
else if (ctx->minicpmv_version == 3) {
embed_dim = 3584;
}
else if (ctx->minicpmv_version == 4) {
embed_dim = 3584;
}
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
for(int i=0;i < pos_w * pos_h; ++i){
for(int j=0; j < embed_dim; ++j){
pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
// set input per projector
switch (ctx->proj_type) {
case PROJECTOR_TYPE_MINICPMV:
{
// inspired from siglip:
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
std::vector<int32_t> positions(pos_h * pos_w);
int bucket_coords_h[1024];
int bucket_coords_w[1024];
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
}
}
for (int i = 0; i < pos_w; i++){
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
}
for (int i = 0, id = 0; i < pos_h; i++){
for (int j = 0; j < pos_w; j++){
positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
}
}
set_input_i32("positions", positions);
ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
free(pos_embed_data);
}
}
else {
if (model.class_embedding) {
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
// inspired from resampler of Qwen-VL:
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
int embed_dim = clip_n_mmproj_embd(ctx);
void* zero_mem = malloc(ggml_nbytes(embeddings));
memset(zero_mem, 0, ggml_nbytes(embeddings));
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
free(zero_mem);
}
// TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
if (ctx->has_qwen2vl_merger) {
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
for(int i = 0; i < pos_w * pos_h; ++i){
for(int j = 0; j < embed_dim; ++j){
pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
}
}
const int pw = image_size_width / patch_size;
const int ph = image_size_height / patch_size;
int* positions_data = (int*)malloc(ggml_nbytes(positions));
set_input_f32("pos_embed", pos_embed);
} break;
case PROJECTOR_TYPE_QWEN2VL:
{
const int merge_ratio = 2;
const int pw = image_size_width / patch_size;
const int ph = image_size_height / patch_size;
std::vector<int> positions(num_positions * 4);
int ptr = 0;
for (int y = 0; y < ph; y += merge_ratio) {
for (int x = 0; x < pw; x += merge_ratio) {
for (int dy = 0; dy < 2; dy++) {
for (int dx = 0; dx < 2; dx++) {
positions[ ptr] = y + dy;
positions[ num_patches + ptr] = x + dx;
positions[2 * num_patches + ptr] = y + dy;
positions[3 * num_patches + ptr] = x + dx;
ptr++;
}
}
}
}
int ptr = 0;
for (int y = 0; y < ph; y+=2)
set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_QWEN25VL:
{
for (int x = 0; x < pw; x+=2)
{
for (int dy = 0; dy < 2; dy++) {
for (int dx = 0; dx < 2; dx++) {
positions_data[ptr] = y + dy;
positions_data[num_patches + ptr] = x + dx;
positions_data[num_patches * 2 + ptr] = y + dy;
positions_data[num_patches * 3 + ptr] = x + dx;
ptr++;
// pw * ph = number of tokens output by ViT after apply patch merger
// ipw * ipw = number of vision token been processed inside ViT
const int merge_ratio = 2;
const int pw = image_size_width / patch_size / merge_ratio;
const int ph = image_size_height / patch_size / merge_ratio;
const int ipw = image_size_width / patch_size;
const int iph = image_size_height / patch_size;
std::vector<int> idx (ph * pw);
std::vector<int> inv_idx(ph * pw);
if (use_window_attn) {
const int attn_window_size = 112;
const int grid_window = attn_window_size / patch_size / merge_ratio;
int dst = 0;
// [num_vision_tokens, num_vision_tokens] attention mask tensor
std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
int mask_row = 0;
for (int y = 0; y < ph; y += grid_window) {
for (int x = 0; x < pw; x += grid_window) {
const int win_h = std::min(grid_window, ph - y);
const int win_w = std::min(grid_window, pw - x);
const int dst_0 = dst;
// group all tokens belong to the same window togather (to a continue range)
for (int dy = 0; dy < win_h; dy++) {
for (int dx = 0; dx < win_w; dx++) {
const int src = (y + dy) * pw + (x + dx);
GGML_ASSERT(src < (int)idx.size());
GGML_ASSERT(dst < (int)inv_idx.size());
idx [src] = dst;
inv_idx[dst] = src;
dst++;
}
}
for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
int row_offset = mask_row * (ipw * iph);
std::fill(
mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
0.0);
mask_row++;
}
}
}
set_input_i32("window_idx", idx);
set_input_i32("inv_window_idx", inv_idx);
set_input_f32("window_mask", mask);
} else {
for (int i = 0; i < ph * pw; i++) {
idx[i] = i;
}
}
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
// do nothing
}
else {
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
const int mpow = merge_ratio * merge_ratio;
std::vector<int> positions(num_positions * 4);
int ptr = 0;
for (int y = 0; y < iph; y += merge_ratio) {
for (int x = 0; x < ipw; x += merge_ratio) {
for (int dy = 0; dy < 2; dy++) {
for (int dx = 0; dx < 2; dx++) {
auto remap = idx[ptr / mpow];
remap = (remap * mpow) + (ptr % mpow);
positions[ remap] = y + dy;
positions[ num_patches + remap] = x + dx;
positions[2 * num_patches + remap] = y + dy;
positions[3 * num_patches + remap] = x + dx;
ptr++;
}
}
}
}
int* positions_data = (int*)malloc(ggml_nbytes(positions));
set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_PIXTRAL:
{
// set the 2D positions
int n_patches_per_col = image_size_width / patch_size;
std::vector<int> pos_data(num_positions);
// dimension H
for (int i = 0; i < num_positions; i++) {
pos_data[i] = i / n_patches_per_col;
}
set_input_i32("pos_h", pos_data);
// dimension W
for (int i = 0; i < num_positions; i++) {
pos_data[i] = i % n_patches_per_col;
}
set_input_i32("pos_w", pos_data);
} break;
case PROJECTOR_TYPE_GLM_EDGE:
{
// llava and other models
std::vector<int32_t> positions(num_positions);
for (int i = 0; i < num_positions; i++) {
positions_data[i] = i;
positions[i] = i;
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
case PROJECTOR_TYPE_LDP:
case PROJECTOR_TYPE_LDPV2:
{
// llava and other models
std::vector<int32_t> positions(num_positions);
for (int i = 0; i < num_positions; i++) {
positions[i] = i;
}
set_input_i32("positions", positions);
if (!ctx->has_glm_projector) {
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
// The patches vector is used to get rows to index into the embeds with;
// we should skip dim 0 only if we have CLS to avoid going out of bounds
// when retrieving the rows.
int patch_offset = model.class_embedding ? 1 : 0;
int* patches_data = (int*)malloc(ggml_nbytes(patches));
std::vector<int32_t> patches(num_patches);
for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + patch_offset;
patches[i] = i + patch_offset;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
}
}
set_input_i32("patches", patches);
} break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_IDEFICS3:
{
// do nothing
} break;
default:
GGML_ABORT("Unknown projector type");
}
ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
......@@ -2676,13 +3377,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
if (ctx->has_glm_projector) {
//eoi
ggml_tensor * eoi = ctx->vision_model.eoi_w;
int offset = ggml_nelements(embeddings);
ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
}
return true;
}
......@@ -2822,56 +3516,52 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
}
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
}
if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
return ctx->vision_model.mm_model_peg_0_b->ne[0];
}
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
return ctx->vision_model.mm_2_b->ne[0];
}
if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
return ctx->vision_model.mm_3_b->ne[0];
}
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
if (ctx->minicpmv_version == 2) {
return 4096;
}
else if (ctx->minicpmv_version == 3) {
return 3584;
}
else if (ctx->minicpmv_version == 4) {
return 3584;
}
}
if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){
return ctx->vision_model.mm_model_mlp_3_w->ne[1];
}
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
return ctx->vision_model.mm_1_b->ne[0];
}
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
return ctx->vision_model.mm_input_proj_w->ne[0];
switch (ctx->proj_type) {
case PROJECTOR_TYPE_LDP:
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
case PROJECTOR_TYPE_LDPV2:
return ctx->vision_model.mm_model_peg_0_b->ne[0];
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_PIXTRAL:
return ctx->vision_model.mm_2_b->ne[0];
case PROJECTOR_TYPE_MLP_NORM:
return ctx->vision_model.mm_3_b->ne[0];
case PROJECTOR_TYPE_MINICPMV:
if (ctx->minicpmv_version == 2) {
return 4096;
} else if (ctx->minicpmv_version == 3) {
return 3584;
} else if (ctx->minicpmv_version == 4) {
return 3584;
}
GGML_ABORT("Unknown minicpmv version");
case PROJECTOR_TYPE_GLM_EDGE:
return ctx->vision_model.mm_model_mlp_3_w->ne[1];
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
return ctx->vision_model.mm_1_b->ne[0];
case PROJECTOR_TYPE_GEMMA3:
return ctx->vision_model.mm_input_proj_w->ne[0];
case PROJECTOR_TYPE_IDEFICS3:
return ctx->vision_model.projection->ne[1];
default:
GGML_ABORT("Unknown projector type");
}
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(string_format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
int clip_is_minicpmv(const struct clip_ctx * ctx) {
if (ctx->has_minicpmv_projector) {
if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
return ctx->minicpmv_version;
}
return 0;
}
bool clip_is_glm(const struct clip_ctx * ctx) {
return ctx->has_glm_projector;
return ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE;
}
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
return ctx->has_qwen2vl_merger;
return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
}
bool clip_is_llava(const struct clip_ctx * ctx) {
......@@ -2882,29 +3572,6 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
}
// Determine the number of encoder layers to iterate over
int get_deepest_feature_layer(const struct clip_ctx * ctx) {
// Get the index of the second to last layer; this is the
// default for models that have a llava projector
const auto & hparams = ctx->vision_model.hparams;
int n_layer = hparams.n_layer - 1;
int deepest_feature_layer = -1;
// Handle other projectors; incrementing here indicates that we
// should use the last encoder layer for the vision features.
if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
n_layer += 1;
}
// If we set explicit vision feature layers, only go up to the deepest one
for (const auto & feature_layer : hparams.vision_feature_layer) {
if (feature_layer > deepest_feature_layer) {
deepest_feature_layer = feature_layer;
}
}
return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
}
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
clip_image_f32 clip_img;
clip_img.buf.resize(h * w * 3);
......
......@@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
......@@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
"use clip_n_output_tokens instead");
GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
"use clip_n_output_tokens instead");
CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
// for M-RoPE, this will be the number of token positions in X and Y directions
// for other models, X will be the total number of tokens and Y will be 1
CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
// this should be equal to the embedding dimension of the text model
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
......@@ -114,8 +125,6 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
......
......@@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
}
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
struct {
struct ggml_context * ctx;
} model;
......@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
model.ctx = ggml_init(params);
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base
for (size_t i = 1; i < num_images; i++) {
......@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
// append without newline tokens (default behavior in llava_arch when not using unpad ):
memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
// Debug: Test single segments
// Current findings: sending base image, sending a segment embedding all works similar to python
......@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
image_embd_v[i],
clip_embd_nbytes_by_img(ctx_clip, nx, ny));
n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
}
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
......@@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
if (!encoded) {
LOG_ERR("Unable to encode image\n");
......@@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
int n_img_pos_out;
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
......
......@@ -111,6 +111,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
};
enum llama_rope_type {
......@@ -1237,6 +1238,7 @@ extern "C" {
"will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
/// Setting k <= 0 makes this a noop
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
......
......@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
{ LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" },
......@@ -73,7 +74,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
{ LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
......@@ -109,6 +109,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
......@@ -511,6 +512,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_NOMIC_BERT_MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_JINA_BERT_V2,
{
......@@ -1587,22 +1606,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
{
LLM_ARCH_MISTRAL3,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
}
},
{
LLM_ARCH_UNKNOWN,
{
......
......@@ -24,6 +24,7 @@ enum llm_arch {
LLM_ARCH_REFACT,
LLM_ARCH_BERT,
LLM_ARCH_NOMIC_BERT,
LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_JINA_BERT_V2,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
......@@ -75,7 +76,6 @@ enum llm_arch {
LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_MISTRAL3,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN,
......@@ -113,6 +113,7 @@ enum llm_kv {
LLM_KV_EXPERT_WEIGHTS_SCALE,
LLM_KV_EXPERT_WEIGHTS_NORM,
LLM_KV_EXPERT_GATING_FUNC,
LLM_KV_MOE_EVERY_N_LAYERS,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,
......
......@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 },
{ "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
......@@ -62,6 +62,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "yandex", LLM_CHAT_TEMPLATE_YANDEX },
{ "bailing", LLM_CHAT_TEMPLATE_BAILING },
{ "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
{ "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
};
llm_chat_template llm_chat_template_from_str(const std::string & name) {
......@@ -81,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
if (tmpl_contains("<|im_start|>")) {
return tmpl_contains("<|im_sep|>")
? LLM_CHAT_TEMPLATE_PHI_4
: LLM_CHAT_TEMPLATE_CHATML;
: tmpl_contains("<end_of_utterance>")
? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
: LLM_CHAT_TEMPLATE_CHATML;
} else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
if (tmpl_contains("[SYSTEM_PROMPT]")) {
return LLM_CHAT_TEMPLATE_MISTRAL_V7;
......@@ -119,8 +122,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
}
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
return LLM_CHAT_TEMPLATE_PHI_3;
} else if (tmpl_contains("[gMASK]<sop>")) {
return LLM_CHAT_TEMPLATE_CHATGLM_4;
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
} else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
return LLM_CHAT_TEMPLATE_GLMEDGE;
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
return LLM_CHAT_TEMPLATE_ZEPHYR;
} else if (tmpl_contains("bos_token + message['role']")) {
......@@ -149,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_LLAMA_3;
} else if (tmpl_contains("[gMASK]sop")) {
// chatglm3-6b
return LLM_CHAT_TEMPLATE_CHATGML_3;
} else if (tmpl_contains("[gMASK]<sop>")) {
return LLM_CHAT_TEMPLATE_CHATGML_4;
return LLM_CHAT_TEMPLATE_CHATGLM_3;
} else if (tmpl_contains(LU8("<用户>"))) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
return LLM_CHAT_TEMPLATE_MINICPM;
......@@ -432,7 +437,7 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
// chatglm3-6b
ss << "[gMASK]" << "sop";
for (auto message : chat) {
......@@ -442,7 +447,7 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
ss << "[gMASK]" << "<sop>";
for (auto message : chat) {
std::string role(message->role);
......@@ -451,14 +456,6 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
for (auto message : chat) {
std::string role(message->role);
ss << "<|" << role << "|>" << "\n" << message->content;
}
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
for (auto message : chat) {
......@@ -620,7 +617,23 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|header_start|>assistant<|header_end|>\n\n";
}
} else {
} else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
// SmolVLM
ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
ss << message->content << "\n\n";
} else if (role == "user") {
ss << "User: " << message->content << "<end_of_utterance>\n";
} else {
ss << "Assistant: " << message->content << "<end_of_utterance>\n";
}
}
if (add_ass) {
ss << "Assistant:";
}
} else {
// template not supported
return -1;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment