Commit 312d9de1 authored by Jesse Gross, committed by Jesse Gross

llama: Improve error handling

Check for NULL return values from llama.cpp in more places and
convert them into Go errors. This should make debugging easier in
the future, rather than leaving hidden surprises in our data
structures.
parent a103dae0
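
The change applies one pattern throughout: when a llama.cpp call that allocates or loads something returns NULL, the wrapper converts it into a Go error at the call site instead of storing the nil pointer. The following is a minimal, self-contained sketch of that pattern; example_alloc and newBuffer are hypothetical stand-ins, not llama.cpp functions.

package main

/*
#include <stdlib.h>

// Hypothetical allocator standing in for a llama.cpp call such as
// llama_load_model_from_file(); returns NULL on failure.
static void *example_alloc(size_t n) { return malloc(n); }
*/
import "C"

import (
	"errors"
	"fmt"
	"unsafe"
)

// newBuffer converts a NULL return from C into a Go error at the call
// site, so no nil pointer is ever stored in a Go-side struct.
func newBuffer(n int) (unsafe.Pointer, error) {
	p := C.example_alloc(C.size_t(n))
	if p == nil {
		return nil, errors.New("unable to allocate buffer")
	}
	return p, nil
}

func main() {
	p, err := newBuffer(1 << 10)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer C.free(p)
	fmt.Println("allocated", p)
}
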
@@ -88,6 +88,7 @@ import (
"fmt"
"runtime"
"runtime/cgo"
"slices"
"strings"
"unsafe"
)
@@ -260,7 +261,7 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
}
m := Model{c: C.llama_load_model_from_file(C.CString(modelPath), cparams)}
if m.c == (*C.struct_llama_model)(C.NULL) {
if m.c == nil {
return nil, fmt.Errorf("unable to load model: %s", modelPath)
}
@@ -276,7 +277,7 @@ func NewContextWithModel(model *Model, params ContextParams) (*Context, error) {
c: C.llama_new_context_with_model(model.c, params.c),
numThreads: int(params.c.n_threads),
}
if c.c == (*C.struct_llama_context)(C.NULL) {
if c.c == nil {
return nil, errors.New("unable to create llama context")
}
@@ -300,6 +301,9 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
defer C.free(unsafe.Pointer(cLoraPath))
loraAdapter := C.llama_lora_adapter_init(m.c, cLoraPath)
if loraAdapter == nil {
return errors.New("unable to load lora")
}
err := -1
if loraAdapter != nil {
@@ -322,13 +326,25 @@ type Batch struct {
// Creates a new batch for either word tokens or image embeddings (if embedSize is non-zero).
// Batches cannot contain both types at the same time. batchSize is the maximum number of entries
// that can be added per sequence
func NewBatch(batchSize int, maxSeq int, embedSize int) *Batch {
return &Batch{
func NewBatch(batchSize int, maxSeq int, embedSize int) (*Batch, error) {
b := Batch{
c: C.llama_batch_init(C.int(batchSize*maxSeq), C.int(embedSize), C.int(maxSeq)),
batchSize: batchSize,
maxSeq: maxSeq,
embedSize: embedSize,
}
// Check to see if any of the allocations in llama_batch_init() failed
nilPointer := (embedSize == 0 && b.c.token == nil) || (embedSize != 0 && b.c.embd == nil) ||
b.c.pos == nil || b.c.n_seq_id == nil || b.c.seq_id == nil || b.c.logits == nil ||
slices.Contains(unsafe.Slice(b.c.seq_id, b.allocSize()), nil)
if nilPointer {
C.llama_batch_free(b.c)
return nil, fmt.Errorf("unable to allocate batch (batchSize=%v maxSeq=%v embedSize=%v)", batchSize, maxSeq, embedSize)
}
return &b, nil
}
func (b *Batch) Size() int {
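
The batch check above relies on viewing the C-allocated seq_id array as a Go slice (unsafe.Slice(b.c.seq_id, b.allocSize())) so it can be scanned for failed allocations. Below is a small, pure-Go sketch of that unsafe.Slice + slices.Contains idiom; the slice of pointers just stands in for the C array and nothing here calls llama.cpp.

package main

import (
	"fmt"
	"slices"
	"unsafe"
)

func main() {
	// Stand-in for the array of per-entry seq_id pointers allocated by
	// llama_batch_init(); one nil entry simulates a failed allocation.
	entries := []*int32{new(int32), nil, new(int32)}

	// unsafe.Slice turns a base pointer plus a length into a Go slice,
	// the same way the batch code views b.c.seq_id before scanning it.
	view := unsafe.Slice(&entries[0], len(entries))

	if slices.Contains(view, nil) {
		fmt.Println("unable to allocate batch: nil pointer found")
	}
}
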
@@ -484,6 +500,9 @@ func NewClipContext(llamaContext *Context, modelPath string) (*ClipContext, erro
mp := C.CString(modelPath)
defer C.free(unsafe.Pointer(mp))
c := C.clip_model_load(mp, 1)
if c == nil {
return nil, fmt.Errorf("unable to load clip model: %v", modelPath)
}
projEmbedSize := int(C.clip_n_mmproj_embd(c))
modelEmbedSize := llamaContext.Model().NEmbd()
@@ -498,8 +517,11 @@ func (c *ClipContext) Free() {
C.clip_free(c.c)
}
func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) [][]float32 {
func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, error) {
l := C.llava_image_embed_make_with_bytes(c.c, C.int(llamaContext.numThreads), (*C.uchar)(unsafe.Pointer(&data[0])), C.int(len(data)))
if l == nil {
return nil, errors.New("unable to make llava embedding from image")
}
numTokens := int(l.n_image_pos)
numEmbed := llamaContext.Model().NEmbd()
@@ -516,7 +538,7 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) [][]float32 {
C.llava_image_embed_free(l)
return embed
return embed, nil
}
type MllamaContext struct {
@@ -527,6 +549,9 @@ func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext,
mp := C.CString(modelPath)
defer C.free(unsafe.Pointer(mp))
c := C.mllama_model_load(mp, 1)
if c == nil {
return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
}
projEmbedSize := int(C.mllama_n_embd(c))
modelEmbedSize := llamaContext.Model().NEmbd()
@@ -541,19 +566,25 @@ func (m *MllamaContext) Free() {
C.mllama_free(m.c)
}
func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) [][]float32 {
func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
img := C.mllama_image_init()
defer C.mllama_image_free(img)
C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img)
ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
if !ok {
return nil, errors.New("unable to load mllama image data")
}
rows := make([]float32, m.EmbedSize(llamaContext))
C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0])))
ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
if !ok {
return nil, errors.New("unable to make mllama embedding from image")
}
embed := make([][]float32, 1)
embed[0] = rows
return embed
return embed, nil
}
func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
@@ -592,7 +623,7 @@ type SamplingParams struct {
Grammar string
}
func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {
func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, error) {
var cparams C.struct_gpt_sampler_cparams
cparams.top_k = C.int32_t(params.TopK)
cparams.top_p = C.float(params.TopP)
@@ -615,9 +646,13 @@ func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {
cparams.grammar = grammar
context := &SamplingContext{c: C.gpt_sampler_cinit(model.c, &cparams)}
if context.c == nil {
return nil, errors.New("unable to create sampling context")
}
runtime.SetFinalizer(context, func(s *SamplingContext) { C.gpt_sampler_cfree(s.c) })
return context
return context, nil
}
func (s *SamplingContext) Reset() {
......
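
One detail worth noting in the sampling-context change above: the nil check runs before runtime.SetFinalizer, so a finalizer that frees the C object is never registered for a failed allocation. A rough sketch of that constructor ordering, with placeholder names and types instead of the real C calls:

package main

import (
	"errors"
	"fmt"
	"runtime"
)

// handle stands in for a wrapper around a C-owned pointer such as
// SamplingContext.c; the field type here is only a placeholder.
type handle struct {
	c *int
}

func newHandle(fail bool) (*handle, error) {
	h := &handle{}
	if !fail {
		h.c = new(int) // stand-in for a successful C allocation
	}

	// Report the failure before registering any cleanup, so the
	// finalizer never runs against a nil C pointer.
	if h.c == nil {
		return nil, errors.New("unable to create handle")
	}
	runtime.SetFinalizer(h, func(h *handle) {
		// the real code would call the C free function here
	})
	return h, nil
}

func main() {
	if _, err := newHandle(true); err != nil {
		fmt.Println(err)
	}
}
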
@@ -63,9 +63,9 @@ func (c *ImageContext) Free(modelPath string) {
}
}
func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) [][]float32 {
func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
if c == nil {
return nil
return nil, nil
}
hash := c.hashImage(data)
@@ -76,17 +76,23 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
embed, err := c.findImage(hash)
if err != nil {
if c.mllama != nil {
embed = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
if err != nil {
return nil, err
}
} else if c.clip != nil {
embed = c.clip.NewEmbed(llamaContext, data)
embed, err = c.clip.NewEmbed(llamaContext, data)
if err != nil {
return nil, err
}
} else {
return nil
return nil, errors.New("received image but vision model not loaded")
}
c.addImage(hash, embed)
}
return embed
return embed, nil
}
func (c *ImageContext) BatchSize(configuredBatchSize int) int {
......
@@ -131,7 +131,10 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
var sc *llama.SamplingContext
if params.samplingParams != nil {
sc = llama.NewSamplingContext(s.model, *params.samplingParams)
sc, err = llama.NewSamplingContext(s.model, *params.samplingParams)
if err != nil {
return nil, err
}
for _, input := range inputs {
if input.embed == nil {
sc.Accept(input.token, false)
@@ -194,7 +197,11 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
return nil, fmt.Errorf("invalid image index: %d", n)
}
embed := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
if err != nil {
return nil, err
}
for _, e := range embed {
inputs = append(inputs, input{embed: e})
}
@@ -305,13 +312,19 @@ func (s *Server) run(ctx context.Context) {
// Logically these batches are used only within the context of processBatch
// but it is better for performance to allocate them once here
tokenBatch := llama.NewBatch(s.batchSize, len(s.seqs), 0)
tokenBatch, err := llama.NewBatch(s.batchSize, len(s.seqs), 0)
if err != nil {
panic(err)
}
defer tokenBatch.Free()
var embedBatch *llama.Batch
embedBatchSize := s.image.BatchSize(s.batchSize)
if embedBatchSize != 0 {
embedBatch = llama.NewBatch(embedBatchSize, len(s.seqs), s.image.EmbedSize(s.lc))
embedBatch, err = llama.NewBatch(embedBatchSize, len(s.seqs), s.image.EmbedSize(s.lc))
if err != nil {
panic(err)
}
defer embedBatch.Free()
} else {
embedBatch = &llama.Batch{}
......
@@ -5,24 +5,28 @@
struct gpt_sampler *gpt_sampler_cinit(
const struct llama_model *model, struct gpt_sampler_cparams *params)
{
gpt_sampler_params sparams;
sparams.top_k = params->top_k;
sparams.top_p = params->top_p;
sparams.min_p = params->min_p;
sparams.tfs_z = params->tfs_z;
sparams.typ_p = params->typical_p;
sparams.temp = params->temp;
sparams.penalty_last_n = params->penalty_last_n;
sparams.penalty_repeat = params->penalty_repeat;
sparams.penalty_freq = params->penalty_freq;
sparams.penalty_present = params->penalty_present;
sparams.mirostat = params->mirostat;
sparams.mirostat_tau = params->mirostat_tau;
sparams.mirostat_eta = params->mirostat_eta;
sparams.penalize_nl = params->penalize_nl;
sparams.seed = params->seed;
sparams.grammar = params->grammar;
return gpt_sampler_init(model, sparams);
try {
gpt_sampler_params sparams;
sparams.top_k = params->top_k;
sparams.top_p = params->top_p;
sparams.min_p = params->min_p;
sparams.tfs_z = params->tfs_z;
sparams.typ_p = params->typical_p;
sparams.temp = params->temp;
sparams.penalty_last_n = params->penalty_last_n;
sparams.penalty_repeat = params->penalty_repeat;
sparams.penalty_freq = params->penalty_freq;
sparams.penalty_present = params->penalty_present;
sparams.mirostat = params->mirostat;
sparams.mirostat_tau = params->mirostat_tau;
sparams.mirostat_eta = params->mirostat_eta;
sparams.penalize_nl = params->penalize_nl;
sparams.seed = params->seed;
sparams.grammar = params->grammar;
return gpt_sampler_init(model, sparams);
} catch (const std::exception & err) {
return nullptr;
}
}
void gpt_sampler_cfree(struct gpt_sampler *sampler)
......