Add experimental MLX backend and engine with imagegen support (#13648)

* WIP - MLX backend with gemma3 * MLX: add cmake and go tag build toggles To build the new MLX backend code: cmake --preset MLX cmake --build --preset MLX --parallel cmake --install build --component MLX go build -tags mlx . Note: the main.go entrypoint for the MLX engine will change in a follow up commit. * add experimental image generation runtime * add experimental image generation runtime * MLX: wire up cuda build for linux * MLX: get dependencies correct and dedup This is still too large for a unified github artifact, but is now "correct" for the mlx_cuda_v13 directory. * fix relative link bug in dedup * Add darwin build and readme * add go build tag for mlx dependent code and wire up build_darwin.sh * lint cleanup * macos: build mlx for x86 This will be CPU only. * cuda build instructions and fix drift from mlx bump * stale comment * Delete agent helper doc * Clean up readme.md * Revise README for tokenizer clarity and details Updated README to clarify tokenizer functionality and removed correctness section. --------- Co-authored-by: jmorganca <jmorganca@gmail.com>

Add experimental MLX backend and engine with imagegen support (#13648)
* WIP - MLX backend with gemma3 * MLX: add cmake and go tag build toggles To build the new MLX backend code: cmake --preset MLX cmake --build --preset MLX --parallel cmake --install build --component MLX go build -tags mlx . Note: the main.go entrypoint for the MLX engine will change in a follow up commit. * add experimental image generation runtime * add experimental image generation runtime * MLX: wire up cuda build for linux * MLX: get dependencies correct and dedup This is still too large for a unified github artifact, but is now "correct" for the mlx_cuda_v13 directory. * fix relative link bug in dedup * Add darwin build and readme * add go build tag for mlx dependent code and wire up build_darwin.sh * lint cleanup * macos: build mlx for x86 This will be CPU only. * cuda build instructions and fix drift from mlx bump * stale comment * Delete agent helper doc * Clean up readme.md * Revise README for tokenizer clarity and details Updated README to clarify tokenizer functionality and removed correctness section. --------- Co-authored-by: jmorganca <jmorganca@gmail.com>
33ee7168 · Daniel Hiltgen · GitHub · 34d0c55e · 33ee7168 · 33ee7168
Unverified Commit 33ee7168 authored Jan 08, 2026 by Daniel Hiltgen Committed by GitHub Jan 08, 2026
20 changed files
--- a/x/imagegen/cache/cache.go
+++ b/x/imagegen/cache/cache.go
+//go:build mlx
+
+package cache
+
+import "github.com/ollama/ollama/x/imagegen/mlx"
+
+// Cache is the key/value cache contract implemented by KVCache and
+// RotatingKVCache.
+type Cache interface {
+	// Update stores k and v for seqLen new positions and returns the cached
+	// keys and values.
+	Update(k, v *mlx.Array, seqLen int) (*mlx.Array, *mlx.Array)
+	// Offset reports the total number of positions stored so far.
+	Offset() int
+	// Len reports the number of positions currently held by the cache.
+	Len() int
+	// State returns the backing arrays, or nil when nothing is stored yet.
+	State() []*mlx.Array
+}
+
+// KVCache is a growable key/value cache that keeps every position it is given.
+type KVCache struct {
+	// keys and values are the backing buffers; both are nil until the first
+	// Update.
+	keys, values *mlx.Array
+	// offset is the number of positions written so far.
+	offset       int
+	// step is the granularity, in positions, by which the buffers grow.
+	step         int
+}
+
+// NewKVCache returns an empty KVCache whose buffers grow in blocks of 256
+// positions.
+func NewKVCache() *KVCache {
+	c := new(KVCache)
+	c.step = 256
+	return c
+}
+
+// Update appends k and v for seqLen new positions and returns slices of the
+// backing buffers covering positions 0 up to the new offset.
+//
+// k and v are indexed as 4-D arrays; axis 2 is the axis the cache grows
+// along. The backing buffers are enlarged in multiples of step.
+func (c *KVCache) Update(k, v *mlx.Array, seqLen int) (*mlx.Array, *mlx.Array) {
+	prev := c.offset
+	shape := k.Shape()
+	B, H, Dk := shape[0], shape[1], shape[3]
+	Dv := v.Shape()[3]
+
+	// Grow buffer if needed
+	if c.keys == nil || (prev+seqLen) > int(c.keys.Shape()[2]) {
+		// Number of step-sized blocks needed to hold the incoming positions.
+		nSteps := (c.step + seqLen - 1) / c.step
+		newK := mlx.Zeros([]int32{B, H, int32(nSteps * c.step), Dk}, k.Dtype())
+		newV := mlx.Zeros([]int32{B, H, int32(nSteps * c.step), Dv}, v.Dtype())
+
+		if c.keys != nil {
+			// Drop unused padding past prev so the new block starts right
+			// after the last written position.
+			if prev%c.step != 0 {
+				c.keys = mlx.Slice(c.keys, []int32{0, 0, 0, 0}, []int32{B, H, int32(prev), Dk})
+				c.values = mlx.Slice(c.values, []int32{0, 0, 0, 0}, []int32{B, H, int32(prev), Dv})
+			}
+			c.keys = mlx.Concatenate([]*mlx.Array{c.keys, newK}, 2)
+			c.values = mlx.Concatenate([]*mlx.Array{c.values, newV}, 2)
+		} else {
+			c.keys, c.values = newK, newV
+		}
+	}
+
+	// Write the new entries into positions [prev, prev+seqLen).
+	c.offset += seqLen
+	c.keys = mlx.SliceUpdateInplace(c.keys, k, []int32{0, 0, int32(prev), 0}, []int32{B, H, int32(c.offset), Dk})
+	c.values = mlx.SliceUpdateInplace(c.values, v, []int32{0, 0, int32(prev), 0}, []int32{B, H, int32(c.offset), Dv})
+
+	// Return only the written prefix, excluding any trailing padding.
+	return mlx.Slice(c.keys, []int32{0, 0, 0, 0}, []int32{B, H, int32(c.offset), Dk}),
+		mlx.Slice(c.values, []int32{0, 0, 0, 0}, []int32{B, H, int32(c.offset), Dv})
+}
+
+// State exposes the backing key and value buffers, in that order. It returns
+// nil while the cache is still empty.
+func (c *KVCache) State() []*mlx.Array {
+	if c.keys != nil {
+		return []*mlx.Array{c.keys, c.values}
+	}
+	return nil
+}
+
+// Offset reports how many positions have been written to the cache.
+func (c *KVCache) Offset() int {
+	return c.offset
+}
+
+// Len reports the number of cached positions; for KVCache this equals Offset.
+func (c *KVCache) Len() int {
+	return c.offset
+}
+
+// RotatingKVCache implements sliding window attention with bounded memory
+type RotatingKVCache struct {
+	// keys and values are the backing buffers; both are nil until the first
+	// Update.
+	keys, values *mlx.Array
+	// offset is the total number of positions ever stored.
+	offset       int
+	// maxSize is the window size: the most positions kept at once.
+	maxSize      int
+	// step is the granularity by which single-token updates grow the buffers.
+	step         int
+	// idx is the position the next single-token update writes to.
+	idx          int
+}
+
+// NewRotatingKVCache returns an empty cache that keeps at most maxSize
+// positions and grows its buffers in blocks of 256.
+func NewRotatingKVCache(maxSize int) *RotatingKVCache {
+	c := new(RotatingKVCache)
+	c.maxSize = maxSize
+	c.step = 256
+	return c
+}
+
+// Update stores k and v and returns the cached keys and values. Updates of at
+// most one position are written in place; longer ones are concatenated.
+func (c *RotatingKVCache) Update(k, v *mlx.Array, seqLen int) (*mlx.Array, *mlx.Array) {
+	if seqLen <= 1 {
+		return c.updateInPlace(k, v)
+	}
+	return c.updateConcat(k, v, seqLen)
+}
+
+// updateInPlace writes a single position of k and v at idx, growing the
+// buffers by up to step positions until they reach maxSize and wrapping idx
+// back to 0 once the window is full. It returns slices covering the first
+// min(offset, maxSize) positions of the buffers.
+func (c *RotatingKVCache) updateInPlace(k, v *mlx.Array) (*mlx.Array, *mlx.Array) {
+	shape := k.Shape()
+	B, H, Dk := shape[0], shape[1], shape[3]
+	Dv := v.Shape()[3]
+
+	// Grow buffer if not yet at max
+	if c.keys == nil || (c.idx >= int(c.keys.Shape()[2]) && int(c.keys.Shape()[2]) < c.maxSize) {
+		// cap is the current buffer length along axis 2 (0 when unallocated).
+		var cap int
+		if c.keys != nil {
+			cap = int(c.keys.Shape()[2])
+		}
+		// Never grow past maxSize.
+		newSize := min(c.step, c.maxSize-cap)
+		newK := mlx.Zeros([]int32{B, H, int32(newSize), Dk}, k.Dtype())
+		newV := mlx.Zeros([]int32{B, H, int32(newSize), Dv}, v.Dtype())
+		if c.keys != nil {
+			c.keys = mlx.Concatenate([]*mlx.Array{c.keys, newK}, 2)
+			c.values = mlx.Concatenate([]*mlx.Array{c.values, newV}, 2)
+		} else {
+			c.keys, c.values = newK, newV
+		}
+	}
+
+	// Rotate when hitting max
+	if c.idx >= c.maxSize {
+		c.idx = 0
+	}
+
+	// Overwrite the single slot at idx.
+	c.keys = mlx.SliceUpdateInplace(c.keys, k, []int32{0, 0, int32(c.idx), 0}, []int32{B, H, int32(c.idx + 1), Dk})
+	c.values = mlx.SliceUpdateInplace(c.values, v, []int32{0, 0, int32(c.idx), 0}, []int32{B, H, int32(c.idx + 1), Dv})
+
+	c.offset++
+	c.idx++
+
+	// After wrapping, the returned entries are in buffer order, not
+	// necessarily oldest-to-newest.
+	validLen := int32(min(c.offset, c.maxSize))
+	return mlx.Slice(c.keys, []int32{0, 0, 0, 0}, []int32{B, H, validLen, Dk}),
+		mlx.Slice(c.values, []int32{0, 0, 0, 0}, []int32{B, H, validLen, Dv})
+}
+
+// updateConcat appends a multi-position block of keys and values and returns
+// the resulting window.
+//
+// The existing buffers are first restored to temporal order: single-token
+// updates may have left unused zero padding after idx or, once the window is
+// full, wrapped around so that the oldest entry sits at idx. Concatenating
+// onto the raw buffer in either state would place padding or out-of-order
+// entries in front of the new ones. After appending, the oldest entries are
+// dropped so that at most maxSize positions remain, and idx is moved to the
+// end of the buffer.
+func (c *RotatingKVCache) updateConcat(k, v *mlx.Array, seqLen int) (*mlx.Array, *mlx.Array) {
+	shape := k.Shape()
+	B, H, Dk := shape[0], shape[1], shape[3]
+	Dv := v.Shape()[3]
+
+	if c.keys == nil {
+		c.keys, c.values = k, v
+	} else {
+		// temporalOrder reads idx and offset, so both buffers are reordered
+		// before either field is updated below.
+		c.keys = mlx.Concatenate([]*mlx.Array{c.temporalOrder(c.keys), k}, 2)
+		c.values = mlx.Concatenate([]*mlx.Array{c.temporalOrder(c.values), v}, 2)
+	}
+	c.offset += seqLen
+
+	// Trim to max_size to maintain sliding window
+	size := int(c.keys.Shape()[2])
+	if trim := size - c.maxSize; trim > 0 {
+		c.keys = mlx.Slice(c.keys, []int32{0, 0, int32(trim), 0}, []int32{B, H, int32(size), Dk})
+		c.values = mlx.Slice(c.values, []int32{0, 0, int32(trim), 0}, []int32{B, H, int32(size), Dv})
+	}
+
+	c.idx = int(c.keys.Shape()[2])
+	return c.keys, c.values
+}
+
+// temporalOrder returns the valid entries of buf (one of this cache's key or
+// value buffers) arranged oldest-to-newest along axis 2.
+func (c *RotatingKVCache) temporalOrder(buf *mlx.Array) *mlx.Array {
+	shape := buf.Shape()
+	B, H, size, D := shape[0], shape[1], int(shape[2]), shape[3]
+
+	switch {
+	case c.idx == size:
+		// Every slot is in use and nothing has wrapped: already in order.
+		return buf
+	case c.idx < c.offset:
+		// Wrapped: the oldest entry is at idx, the newest just before it.
+		older := mlx.Slice(buf, []int32{0, 0, int32(c.idx), 0}, []int32{B, H, int32(size), D})
+		newer := mlx.Slice(buf, []int32{0, 0, 0, 0}, []int32{B, H, int32(c.idx), D})
+		return mlx.Concatenate([]*mlx.Array{older, newer}, 2)
+	default:
+		// Not wrapped: everything from idx onward is unused padding.
+		return mlx.Slice(buf, []int32{0, 0, 0, 0}, []int32{B, H, int32(c.idx), D})
+	}
+}
+
+// State exposes the backing key and value buffers, in that order. It returns
+// nil while the cache is still empty.
+func (c *RotatingKVCache) State() []*mlx.Array {
+	if c.keys != nil {
+		return []*mlx.Array{c.keys, c.values}
+	}
+	return nil
+}
+
+// Offset reports the total number of positions ever stored, including those
+// that have since left the window.
+func (c *RotatingKVCache) Offset() int {
+	return c.offset
+}
+
+// Len reports how many positions the window currently holds, which never
+// exceeds maxSize.
+func (c *RotatingKVCache) Len() int {
+	return min(c.offset, c.maxSize)
+}
--- a/x/imagegen/cache/step.go
+++ b/x/imagegen/cache/step.go
+//go:build mlx
+
+package cache
+
+import "github.com/ollama/ollama/x/imagegen/mlx"
+
+// StepCache caches layer outputs across diffusion denoising steps.
+// Based on DeepCache (CVPR 2024) and Learning-to-Cache (NeurIPS 2024):
+// shallow layers change little between consecutive steps, so we can
+// cache their outputs and skip recomputation on non-refresh steps.
+//
+// Supports both single-stream (Z-Image) and dual-stream (Qwen-Image) architectures:
+//   - Single-stream: use Get/Set for the single output per layer
+//   - Dual-stream: use Get/Set for stream 1 (imgH), Get2/Set2 for stream 2 (txtH)
+//
+// Usage (single-stream):
+//
+//	cache := NewStepCache(15)  // cache first 15 layers
+//	for step := 0; step < numSteps; step++ {
+//	    refresh := cache.ShouldRefresh(step, 3)  // refresh every 3 steps
+//	    for i, layer := range layers {
+//	        if i < 15 && !refresh && cache.Get(i) != nil {
+//	            output = cache.Get(i)  // reuse cached
+//	        } else {
+//	            output = layer.Forward(input)
+//	            if i < 15 && refresh {
+//	                cache.Set(i, output)
+//	            }
+//	        }
+//	    }
+//	}
+//	cache.Free()  // cleanup when done
+//
+// Usage (dual-stream):
+//
+//	cache := NewStepCache(15)
+//	for step := 0; step < numSteps; step++ {
+//	    refresh := cache.ShouldRefresh(step, 3)
+//	    for i, layer := range layers {
+//	        if i < 15 && !refresh && cache.Get(i) != nil {
+//	            imgH, txtH = cache.Get(i), cache.Get2(i)
+//	        } else {
+//	            imgH, txtH = layer.Forward(imgH, txtH, ...)
+//	            if i < 15 && refresh {
+//	                cache.Set(i, imgH)
+//	                cache.Set2(i, txtH)
+//	            }
+//	        }
+//	    }
+//	}
+type StepCache struct {
+	// The Set methods and Free call Free on the arrays stored here, so an
+	// array handed to the cache should be treated as owned by it.
+	layers   []*mlx.Array // cached layer outputs (stream 1)
+	layers2  []*mlx.Array // cached layer outputs (stream 2, for dual-stream models)
+	constant *mlx.Array   // optional constant (e.g., text embeddings)
+}
+
+// NewStepCache allocates a cache with numLayers empty slots for each of the
+// two streams.
+func NewStepCache(numLayers int) *StepCache {
+	sc := new(StepCache)
+	sc.layers = make([]*mlx.Array, numLayers)
+	sc.layers2 = make([]*mlx.Array, numLayers)
+	return sc
+}
+
+// ShouldRefresh returns true if the cache should be refreshed at this step.
+// Refresh happens on step 0, interval, 2*interval, etc.
+//
+// A non-positive interval means "refresh on every step": it disables reuse
+// rather than dividing by zero.
+func (c *StepCache) ShouldRefresh(step, interval int) bool {
+	if interval <= 0 {
+		return true
+	}
+	return step%interval == 0
+}
+
+// Get returns the cached output for a layer (stream 1), or nil if the layer
+// is not cached or the index is outside the cache (including negative).
+func (c *StepCache) Get(layer int) *mlx.Array {
+	if layer < 0 || layer >= len(c.layers) {
+		return nil
+	}
+	return c.layers[layer]
+}
+
+// Set stores a layer output (stream 1), freeing any previous value.
+// Indices outside the cache (including negative ones) are ignored.
+func (c *StepCache) Set(layer int, arr *mlx.Array) {
+	if layer < 0 || layer >= len(c.layers) {
+		return
+	}
+	// Re-storing the array that is already cached must not free it, or the
+	// slot would end up holding a released array.
+	if prev := c.layers[layer]; prev != nil && prev != arr {
+		prev.Free()
+	}
+	c.layers[layer] = arr
+}
+
+// Get2 returns the cached output for a layer (stream 2), or nil if the layer
+// is not cached or the index is outside the cache (including negative).
+// Used for dual-stream architectures like Qwen-Image.
+func (c *StepCache) Get2(layer int) *mlx.Array {
+	if layer < 0 || layer >= len(c.layers2) {
+		return nil
+	}
+	return c.layers2[layer]
+}
+
+// Set2 stores a layer output (stream 2), freeing any previous value.
+// Indices outside the cache (including negative ones) are ignored.
+// Used for dual-stream architectures like Qwen-Image.
+func (c *StepCache) Set2(layer int, arr *mlx.Array) {
+	if layer < 0 || layer >= len(c.layers2) {
+		return
+	}
+	// Re-storing the array that is already cached must not free it, or the
+	// slot would end up holding a released array.
+	if prev := c.layers2[layer]; prev != nil && prev != arr {
+		prev.Free()
+	}
+	c.layers2[layer] = arr
+}
+
+// GetConstant returns the cached constant value, or nil if none has been
+// stored.
+func (c *StepCache) GetConstant() *mlx.Array {
+	return c.constant
+}
+
+// SetConstant stores a constant value, freeing any previous value.
+func (c *StepCache) SetConstant(arr *mlx.Array) {
+	// Re-storing the current constant must not free it, or the cache would
+	// end up holding a released array.
+	if c.constant != nil && c.constant != arr {
+		c.constant.Free()
+	}
+	c.constant = arr
+}
+
+// Arrays collects every non-nil array held by the cache (for pool.Keep): the
+// constant first, then stream 1, then stream 2.
+func (c *StepCache) Arrays() []*mlx.Array {
+	var kept []*mlx.Array
+	if c.constant != nil {
+		kept = append(kept, c.constant)
+	}
+	for _, stream := range [][]*mlx.Array{c.layers, c.layers2} {
+		for _, a := range stream {
+			if a == nil {
+				continue
+			}
+			kept = append(kept, a)
+		}
+	}
+	return kept
+}
+
+// Free releases every cached array and clears its slot. Call it once
+// generation completes.
+func (c *StepCache) Free() {
+	if c.constant != nil {
+		c.constant.Free()
+		c.constant = nil
+	}
+	// The inner slices alias c.layers and c.layers2, so clearing a slot here
+	// clears it in the cache.
+	for _, stream := range [][]*mlx.Array{c.layers, c.layers2} {
+		for i := range stream {
+			if stream[i] == nil {
+				continue
+			}
+			stream[i].Free()
+			stream[i] = nil
+		}
+	}
+}
+
+// NumLayers returns the number of layers this cache can store, i.e. the
+// value passed to NewStepCache.
+func (c *StepCache) NumLayers() int {
+	return len(c.layers)
+}
--- a/x/imagegen/cmd/engine/generate.go
+++ b/x/imagegen/cmd/engine/generate.go
+//go:build mlx
+
+package main
+
+import (
+	"context"
+	"fmt"
+	"time"
+	"unicode/utf8"
+
+	"github.com/ollama/ollama/x/imagegen/cache"
+	"github.com/ollama/ollama/x/imagegen/mlx"
+	"github.com/ollama/ollama/x/imagegen/tokenizer"
+)
+
+// Dedicated stream for generation (like mlx-lm's generation_stream).
+// It is created in init and installed as the default stream by withStream.
+var generationStream *mlx.Stream
+
+// utf8Streamer buffers decoded text and emits only complete UTF-8 characters.
+// This handles cases where tokenizers output partial multi-byte sequences.
+type utf8Streamer struct {
+	buffer []byte // bytes received but not yet emitted
+}
+
+// Write appends text to the buffer and returns the longest buffered prefix
+// that does not end in the middle of a multi-byte character. A truncated
+// sequence at the end of the buffer is held back until later writes complete
+// it. Bytes that can never form a valid character are passed through
+// unchanged rather than held.
+func (s *utf8Streamer) Write(text string) string {
+	s.buffer = append(s.buffer, text...)
+
+	validLen := 0
+	for validLen < len(s.buffer) {
+		rest := s.buffer[validLen:]
+		// FullRune is false only for a valid-so-far but truncated sequence;
+		// definitely invalid bytes count as full (width-1) runes.
+		if !utf8.FullRune(rest) {
+			break
+		}
+		_, size := utf8.DecodeRune(rest)
+		validLen += size
+	}
+
+	if validLen == 0 {
+		return ""
+	}
+
+	result := string(s.buffer[:validLen])
+	s.buffer = s.buffer[validLen:]
+	return result
+}
+
+// Flush returns any remaining buffered bytes (may be incomplete UTF-8) and
+// empties the buffer.
+func (s *utf8Streamer) Flush() string {
+	if len(s.buffer) == 0 {
+		return ""
+	}
+	result := string(s.buffer)
+	s.buffer = nil
+	return result
+}
+
+// init creates the dedicated generation stream at package load.
+func init() {
+	generationStream = mlx.NewStream()
+}
+
+// withStream runs fn with the generation stream as default and restores the
+// previous default stream afterwards.
+func withStream(fn func()) {
+	orig := mlx.GetDefaultStream()
+	mlx.SetDefaultStream(generationStream)
+	// Deferred so the previous default is restored even if fn panics.
+	defer mlx.SetDefaultStream(orig)
+	fn()
+}
+
+// Model is what the text generation loop requires of a language model.
+type Model interface {
+	// Tokenizer returns the tokenizer used to encode prompts and decode
+	// generated tokens.
+	Tokenizer() *tokenizer.Tokenizer
+	// VocabSize returns the vocabulary size passed to the sampler.
+	VocabSize() int32
+	// NewCache creates the caches passed to Forward.
+	NewCache(maxSeqLen int32) []cache.Cache
+	// Forward runs the model on input using caches; the generation loop
+	// passes its result to the sampler as logits.
+	Forward(input *mlx.Array, caches []cache.Cache) *mlx.Array
+}
+
+// ChatModel is an optional interface for models that support chat formatting
+type ChatModel interface {
+	// FormatPrompt returns the prompt text that is tokenized in place of the
+	// raw prompt.
+	FormatPrompt(prompt string) string
+}
+
+// MultimodalModel is for models that support image input
+type MultimodalModel interface {
+	Model
+	// FormatPromptWithImage formats the prompt when an image accompanies it.
+	FormatPromptWithImage(prompt string) string
+	// ExpandImageTokens is applied to the encoded prompt before prefill.
+	ExpandImageTokens(tokens []int32) []int32
+	// ForwardWithImage is used in place of Forward for the prefill pass that
+	// carries the image.
+	ForwardWithImage(tokens *mlx.Array, image *mlx.Array, caches []cache.Cache) *mlx.Array
+	ImageSize() int32 // Returns expected image size for preprocessing
+}
+
+// ImageLoader loads and preprocesses an image for multimodal models.
+// Returns nil if path is empty.
+// imageSize is the target size the model expects (see
+// MultimodalModel.ImageSize).
+type ImageLoader func(path string, imageSize int32) (*mlx.Array, error)
+
+// input carries the parameters of one generate call.
+type input struct {
+	Prompt       string
+	Image        *mlx.Array // Optional preprocessed image for multimodal models
+	MaxTokens    int
+	// Temperature below zero selects the default of 0.7.
+	Temperature  float32
+	TopP         float32
+	TopK         int
+	WiredLimitGB int // Metal wired memory limit in GB (default 32)
+}
+
+// output is one event delivered to the generate callback: either a piece of
+// decoded text, or the final event with Done set and the throughput figures
+// filled in.
+type output struct {
+	Text          string
+	Done          bool
+	PrefillTokSec float64
+	GenTokSec     float64
+}
+
+// Decoder wraps model + cache for autoregressive generation.
+// prefill consumes the prompt and samples the first token; each step then
+// samples one further token.
+type Decoder struct {
+	model         Model
+	caches        []cache.Cache
+	vocabSize     int32
+	temp          float32
+	topK          int
+	topP          float32
+	token         *mlx.Array   // Current token (kept across pools)
+	oldCacheState []*mlx.Array // Preallocated slice for old cache state
+	image         *mlx.Array   // Optional image for multimodal prefill
+}
+
+// NewDecoder builds a Decoder for m with fresh caches and the given sampling
+// settings.
+func NewDecoder(m Model, temp float32, topK int, topP float32) *Decoder {
+	d := &Decoder{
+		model:     m,
+		caches:    m.NewCache(0),
+		vocabSize: m.VocabSize(),
+		temp:      temp,
+		topK:      topK,
+		topP:      topP,
+	}
+	// Two state arrays (keys and values) per cache.
+	d.oldCacheState = make([]*mlx.Array, 0, 2*len(d.caches))
+	return d
+}
+
+// SetImage sets the image for multimodal prefill (call before prefill).
+// A non-nil image also makes prefill skip chunking and process the whole
+// prompt in one forward pass.
+func (d *Decoder) SetImage(img *mlx.Array) {
+	d.image = img
+}
+
+// prefill runs the prompt tokens through the model to populate the caches,
+// samples the first generated token into d.token, and returns the number of
+// prompt tokens processed.
+//
+// Without an image, all but the last token are processed in chunks of up to
+// 2048 tokens, evaluating and releasing cache state after each chunk. The
+// remaining tokens (the whole prompt when an image is set) go through one
+// final forward pass whose output is sampled.
+func (d *Decoder) prefill(inputIDs []int32) int {
+	processed := 0
+
+	// Track old cache state to free after each chunk
+	var oldCacheState []*mlx.Array
+
+	// For multimodal models with an image, we need to process all tokens together
+	// in the first forward pass so the image embeddings can be inserted properly.
+	// Skip chunking for multimodal prefill.
+	isMultimodal := d.image != nil
+
+	// Process all-but-1 tokens in chunks, eval cache state for memory management
+	// Skip chunking for multimodal - process everything in the final step
+	if !isMultimodal {
+		for len(inputIDs) > 1 {
+			chunkSize := min(2048, len(inputIDs)-1)
+			// Defensive only: len(inputIDs) > 1 guarantees chunkSize >= 1.
+			if chunkSize <= 0 {
+				break
+			}
+			chunk := inputIDs[:chunkSize]
+
+			// Save old cache state before forward
+			oldCacheState = oldCacheState[:0]
+			for _, c := range d.caches {
+				oldCacheState = append(oldCacheState, c.State()...)
+			}
+
+			var cacheState []*mlx.Array
+			withStream(func() {
+				x := mlx.NewArrayInt32(chunk, []int32{1, int32(len(chunk))})
+				// The forward result is discarded; only the cache update
+				// matters for these chunks.
+				d.model.Forward(x, d.caches)
+				for _, c := range d.caches {
+					cacheState = append(cacheState, c.State()...)
+				}
+			})
+			mlx.Eval(cacheState...)
+
+			// Free old cache state
+			for _, arr := range oldCacheState {
+				if arr != nil {
+					arr.Free()
+				}
+			}
+
+			inputIDs = inputIDs[chunkSize:]
+			processed += chunkSize
+		}
+	}
+
+	// Save old cache state before final step
+	oldCacheState = oldCacheState[:0]
+	for _, c := range d.caches {
+		oldCacheState = append(oldCacheState, c.State()...)
+	}
+
+	// Final token + sampling (or all tokens for multimodal)
+	withStream(func() {
+		x := mlx.NewArrayInt32(inputIDs, []int32{1, int32(len(inputIDs))})
+		mlx.Eval(x) // Materialize before any other evals
+
+		var logits *mlx.Array
+		// Use ForwardWithImage if we have an image and model supports it
+		if d.image != nil {
+			if mm, ok := d.model.(MultimodalModel); ok {
+				logits = mm.ForwardWithImage(x, d.image, d.caches)
+				d.image = nil // Only use image for first forward
+			} else {
+				// Image set but the model cannot take it: plain forward, and
+				// d.image is left as it was.
+				logits = d.model.Forward(x, d.caches)
+			}
+		} else {
+			logits = d.model.Forward(x, d.caches)
+		}
+		d.token = sample(logits, d.temp, d.topK, d.topP, d.vocabSize)
+	})
+	// Keep cache state (token auto-kept by AsyncEval)
+	for _, c := range d.caches {
+		mlx.Keep(c.State()...)
+	}
+	mlx.AsyncEval(d.token)
+
+	// Free old cache state from before final step
+	for _, arr := range oldCacheState {
+		if arr != nil {
+			arr.Free()
+		}
+	}
+
+	mlx.ClearCache()
+
+	return processed + len(inputIDs)
+}
+
+// step feeds the previously sampled token through the model to sample the
+// next one into d.token, then returns the id of the previous token. The
+// returned token therefore lags the sampled one by a single step, so the
+// first call after prefill returns the token prefill sampled.
+func (d *Decoder) step() int32 {
+	prevToken := d.token
+
+	// Save old cache state (reuse preallocated slice)
+	d.oldCacheState = d.oldCacheState[:0]
+	for _, c := range d.caches {
+		d.oldCacheState = append(d.oldCacheState, c.State()...)
+	}
+
+	withStream(func() {
+		logits := d.model.Forward(mlx.Reshape(prevToken, 1, 1), d.caches)
+		d.token = sample(logits, d.temp, d.topK, d.topP, d.vocabSize)
+	})
+	// Keep token and new cache state so they survive cleanup
+	mlx.Keep(d.token)
+	for _, c := range d.caches {
+		mlx.Keep(c.State()...)
+	}
+	mlx.AsyncEval(d.token)
+
+	// Sync on previous token (GPU already working on next step)
+	val := prevToken.ItemInt32()
+
+	// Free old token and old cache state
+	prevToken.Free()
+	for _, arr := range d.oldCacheState {
+		arr.Free()
+	}
+	return val
+}
+
+// generate runs autoregressive text generation for in.Prompt on m. Decoded
+// text is streamed to cb as it becomes available, and a final output with
+// Done set carries the prefill and generation throughput.
+//
+// in.MaxTokens caps the number of generated tokens; a non-positive value
+// selects the default of 100. A negative in.Temperature selects 0.7 and a
+// non-positive in.WiredLimitGB selects 32. Generation stops early at an
+// end-of-sequence token. If ctx is cancelled between tokens, generate
+// returns ctx.Err() without sending the final Done output.
+func generate(ctx context.Context, m Model, in input, cb func(output)) error {
+	mlx.EnableCompile()
+	wiredLimit := in.WiredLimitGB
+	if wiredLimit <= 0 {
+		wiredLimit = 32 // default 32GB
+	}
+	mlx.MetalSetWiredLimit(uint64(wiredLimit) << 30)
+
+	temp := in.Temperature
+	if temp < 0 {
+		temp = 0.7
+	}
+
+	tok := m.Tokenizer()
+	dec := NewDecoder(m, temp, in.TopK, in.TopP)
+
+	// Apply chat template - use image template if we have an image
+	prompt := in.Prompt
+	var tokens []int32
+	if mm, ok := m.(MultimodalModel); ok && in.Image != nil {
+		prompt = mm.FormatPromptWithImage(prompt)
+		tokens = tok.Encode(prompt, true)
+		tokens = mm.ExpandImageTokens(tokens) // Expand <start_of_image> to 256 image tokens
+		dec.SetImage(in.Image)
+	} else if cm, ok := m.(ChatModel); ok {
+		prompt = cm.FormatPrompt(prompt)
+		tokens = tok.Encode(prompt, true)
+	} else {
+		tokens = tok.Encode(prompt, true)
+	}
+
+	prefillStart := time.Now()
+	prefillTokens := dec.prefill(tokens)
+	// Prefill measurement should include time to first token (like mlx-lm)
+	// Step() waits for prefill to complete and returns first token
+	firstToken := dec.step()
+	prefillTokSec := float64(prefillTokens) / time.Since(prefillStart).Seconds()
+
+	genStart := time.Now()
+	// MaxTokens is an upper bound; only fall back to the default when the
+	// caller did not provide a usable one.
+	maxTokens := in.MaxTokens
+	if maxTokens <= 0 {
+		maxTokens = 100
+	}
+	var genTokens int
+
+	// UTF-8 streamer to handle partial multi-byte characters
+	streamer := &utf8Streamer{}
+
+	// Handle first token
+	genTokens++
+	if tok.IsEOS(firstToken) {
+		cb(output{Done: true, PrefillTokSec: prefillTokSec, GenTokSec: 0})
+		return nil
+	}
+	if text := streamer.Write(tok.Decode([]int32{firstToken})); text != "" {
+		cb(output{Text: text})
+	}
+
+	// The first token has already been produced, so start counting at 1.
+	for n := 1; n < maxTokens; n++ {
+		if ctx.Err() != nil {
+			return ctx.Err()
+		}
+		token := dec.step()
+		genTokens++
+
+		if tok.IsEOS(token) {
+			break
+		}
+		if text := streamer.Write(tok.Decode([]int32{token})); text != "" {
+			cb(output{Text: text})
+		}
+
+		if n%256 == 0 {
+			mlx.ClearCache()
+		}
+	}
+
+	// Flush any remaining buffered bytes
+	if text := streamer.Flush(); text != "" {
+		cb(output{Text: text})
+	}
+
+	fmt.Printf("\nPeak memory: %.2fGB\n", float64(mlx.MetalGetPeakMemory())/(1<<30))
+	cb(output{Done: true, PrefillTokSec: prefillTokSec,
+		GenTokSec: float64(genTokens) / time.Since(genStart).Seconds()})
+	return nil
+}
--- a/x/imagegen/cmd/engine/image.go
+++ b/x/imagegen/cmd/engine/image.go
+//go:build mlx
+
+package main
+
+import (
+	"fmt"
+	"image"
+	"image/png"
+	"os"
+	"path/filepath"
+
+	"github.com/ollama/ollama/x/imagegen/mlx"
+)
+
+// saveImageArray converts an MLX array to an image and writes it to path as
+// a PNG.
+// Expected format: [B, C, H, W] with values in [0, 1] range and C=3 (RGB).
+func saveImageArray(arr *mlx.Array, path string) error {
+	rgba, convErr := arrayToImage(arr)
+	if convErr != nil {
+		return convErr
+	}
+	return savePNG(rgba, path)
+}
+
+// savePNG encodes img as PNG and writes it to path, appending ".png" when
+// path does not already have that extension. It returns the first error from
+// creating, encoding, or closing the file.
+func savePNG(img *image.RGBA, path string) error {
+	if filepath.Ext(path) != ".png" {
+		path = path + ".png"
+	}
+	f, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	if err := png.Encode(f, img); err != nil {
+		f.Close() // best effort; the encode error is the one worth reporting
+		return err
+	}
+	// Close can surface a delayed write failure, so its error is returned
+	// rather than dropped by a defer.
+	return f.Close()
+}
+
+// arrayToImage converts a 4-D [B, C, H, W] array into an opaque RGBA image,
+// scaling each channel value by 255, rounding, and clamping to [0, 255].
+//
+// It returns an error if arr is not 4-D or does not have 3 channels. On every
+// path past the rank check arr is freed, so the caller must not use it
+// afterwards; when the rank check fails arr is left untouched.
+func arrayToImage(arr *mlx.Array) (*image.RGBA, error) {
+	shape := arr.Shape()
+	if len(shape) != 4 {
+		return nil, fmt.Errorf("expected 4D array [B, C, H, W], got %v", shape)
+	}
+
+	// Transform to [H, W, C] for image conversion
+	// NOTE(review): squeezing axis 0 presumably requires B == 1 — confirm.
+	img := mlx.Squeeze(arr, 0)
+	arr.Free()
+	img = mlx.Transpose(img, 1, 2, 0)
+	img = mlx.Contiguous(img)
+	mlx.Eval(img)
+
+	imgShape := img.Shape()
+	H := int(imgShape[0])
+	W := int(imgShape[1])
+	C := int(imgShape[2])
+
+	if C != 3 {
+		img.Free()
+		return nil, fmt.Errorf("expected 3 channels (RGB), got %d", C)
+	}
+
+	// Copy to CPU and free GPU memory
+	data := img.Data()
+	img.Free()
+
+	// Write directly to Pix slice (faster than SetRGBA)
+	goImg := image.NewRGBA(image.Rect(0, 0, W, H))
+	pix := goImg.Pix
+	for y := 0; y < H; y++ {
+		for x := 0; x < W; x++ {
+			srcIdx := (y*W + x) * C
+			dstIdx := (y*W + x) * 4
+			// +0.5 rounds to nearest before the truncating uint8 conversion.
+			pix[dstIdx+0] = uint8(clampF(data[srcIdx+0]*255+0.5, 0, 255))
+			pix[dstIdx+1] = uint8(clampF(data[srcIdx+1]*255+0.5, 0, 255))
+			pix[dstIdx+2] = uint8(clampF(data[srcIdx+2]*255+0.5, 0, 255))
+			pix[dstIdx+3] = 255
+		}
+	}
+
+	return goImg, nil
+}
+
+// clampF limits v to the range bounded below by lo and above by hi. The
+// lower bound is checked first.
+func clampF(v, lo, hi float32) float32 {
+	switch {
+	case v < lo:
+		return lo
+	case v > hi:
+		return hi
+	default:
+		return v
+	}
+}
--- a/x/imagegen/cmd/engine/main.go
+++ b/x/imagegen/cmd/engine/main.go
+//go:build mlx
+
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"path/filepath"
+	"runtime/pprof"
+
+	"github.com/ollama/ollama/x/imagegen/mlx"
+	"github.com/ollama/ollama/x/imagegen/models/gemma3"
+	"github.com/ollama/ollama/x/imagegen/models/gpt_oss"
+	"github.com/ollama/ollama/x/imagegen/models/llama"
+	"github.com/ollama/ollama/x/imagegen/models/qwen_image"
+	"github.com/ollama/ollama/x/imagegen/models/qwen_image_edit"
+	"github.com/ollama/ollama/x/imagegen/models/zimage"
+	"github.com/ollama/ollama/x/imagegen/safetensors"
+)
+
+// stringSlice is a flag type that accumulates multiple values
+type stringSlice []string
+
+// String formats the collected values. It tolerates a nil receiver because
+// the flag package may call String on a zero-valued receiver.
+func (s *stringSlice) String() string {
+	if s == nil {
+		return "[]"
+	}
+	return fmt.Sprintf("%v", *s)
+}
+
+// Set appends value to the slice; it never returns an error.
+func (s *stringSlice) Set(value string) error {
+	*s = append(*s, value)
+	return nil
+}
+
+// main parses the command-line flags and runs exactly one mode: Z-Image
+// generation, Qwen-Image generation, Qwen-Image-Edit editing, tensor
+// listing, or (by default) text generation. Any error from the selected mode
+// is fatal.
+func main() {
+	modelPath := flag.String("model", "", "Model directory")
+	prompt := flag.String("prompt", "Hello", "Prompt")
+
+	// Text generation params
+	maxTokens := flag.Int("max-tokens", 100, "Max tokens")
+	temperature := flag.Float64("temperature", 0.7, "Temperature")
+	topP := flag.Float64("top-p", 0.9, "Top-p sampling")
+	topK := flag.Int("top-k", 40, "Top-k sampling")
+	imagePath := flag.String("image", "", "Image path for multimodal models")
+
+	// Image generation params
+	width := flag.Int("width", 1024, "Image width")
+	height := flag.Int("height", 1024, "Image height")
+	steps := flag.Int("steps", 9, "Denoising steps")
+	seed := flag.Int64("seed", 42, "Random seed")
+	out := flag.String("output", "output.png", "Output path")
+
+	// Utility flags
+	listTensors := flag.Bool("list", false, "List tensors only")
+	cpuProfile := flag.String("cpuprofile", "", "Write CPU profile to file")
+	gpuCapture := flag.String("gpu-capture", "", "Capture GPU trace to .gputrace file (run with MTL_CAPTURE_ENABLED=1)")
+	layerCache := flag.Bool("layer-cache", false, "Enable layer caching for faster diffusion (Z-Image, Qwen-Image). Not compatible with CFG/negative prompts.")
+	wiredLimitGB := flag.Int("wired-limit", 32, "Metal wired memory limit in GB")
+
+	// Legacy mode flags
+	zimageFlag := flag.Bool("zimage", false, "Z-Image generation")
+	qwenImage := flag.Bool("qwen-image", false, "Qwen-Image text-to-image generation")
+	qwenImageEdit := flag.Bool("qwen-image-edit", false, "Qwen-Image-Edit image editing")
+	var inputImages stringSlice
+	flag.Var(&inputImages, "input-image", "Input image for image editing (can be specified multiple times)")
+	negativePrompt := flag.String("negative-prompt", "", "Negative prompt for CFG (empty = no CFG, matching Python)")
+	cfgScale := flag.Float64("cfg-scale", 4.0, "CFG scale for image editing")
+
+	flag.Parse()
+
+	if *modelPath == "" {
+		flag.Usage()
+		return
+	}
+
+	// CPU profiling
+	if *cpuProfile != "" {
+		f, err := os.Create(*cpuProfile)
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer f.Close()
+		if err := pprof.StartCPUProfile(f); err != nil {
+			log.Fatal(err)
+		}
+		defer pprof.StopCPUProfile()
+	}
+
+	// err holds the result of whichever mode runs; it is checked once after
+	// the switch. Cases must assign to it (never redeclare it with :=), or
+	// the failure is lost.
+	var err error
+
+	// Handle legacy mode flags that aren't unified yet
+	switch {
+	case *zimageFlag:
+		m := &zimage.Model{}
+		if loadErr := m.Load(*modelPath); loadErr != nil {
+			log.Fatal(loadErr)
+		}
+		var img *mlx.Array
+		img, err = m.GenerateFromConfig(&zimage.GenerateConfig{
+			Prompt:      *prompt,
+			Width:       int32(*width),
+			Height:      int32(*height),
+			Steps:       *steps,
+			Seed:        *seed,
+			CapturePath: *gpuCapture,
+			LayerCache:  *layerCache,
+		})
+		if err == nil {
+			err = saveImageArray(img, *out)
+		}
+	case *qwenImage:
+		m, loadErr := qwen_image.LoadPersistent(*modelPath)
+		if loadErr != nil {
+			log.Fatal(loadErr)
+		}
+		var img *mlx.Array
+		img, err = m.GenerateFromConfig(&qwen_image.GenerateConfig{
+			Prompt:         *prompt,
+			NegativePrompt: *negativePrompt,
+			CFGScale:       float32(*cfgScale),
+			Width:          int32(*width),
+			Height:         int32(*height),
+			Steps:          *steps,
+			Seed:           *seed,
+			LayerCache:     *layerCache,
+		})
+		if err == nil {
+			err = saveImageArray(img, *out)
+		}
+	case *qwenImageEdit:
+		if len(inputImages) == 0 {
+			log.Fatal("qwen-image-edit requires at least one -input-image")
+		}
+
+		m, loadErr := qwen_image_edit.LoadPersistent(*modelPath)
+		if loadErr != nil {
+			log.Fatal(loadErr)
+		}
+		// For image editing, use 0 for dimensions to auto-detect from input image
+		// unless explicitly overridden from defaults
+		editWidth := int32(0)
+		editHeight := int32(0)
+		if *width != 1024 {
+			editWidth = int32(*width)
+		}
+		if *height != 1024 {
+			editHeight = int32(*height)
+		}
+
+		cfg := &qwen_image_edit.GenerateConfig{
+			Prompt:         *prompt,
+			NegativePrompt: *negativePrompt,
+			CFGScale:       float32(*cfgScale),
+			Width:          editWidth,
+			Height:         editHeight,
+			Steps:          *steps,
+			Seed:           *seed,
+		}
+
+		var img *mlx.Array
+		img, err = m.EditFromConfig(inputImages, cfg)
+		if err == nil {
+			err = saveImageArray(img, *out)
+		}
+	case *listTensors:
+		err = listModelTensors(*modelPath)
+	default:
+		// llm path
+		m, loadErr := load(*modelPath)
+		if loadErr != nil {
+			log.Fatal(loadErr)
+		}
+
+		// Load image if provided and model supports it
+		var image *mlx.Array
+		if *imagePath != "" {
+			mm, ok := m.(interface{ ImageSize() int32 })
+			if !ok {
+				log.Fatal("model does not support image input")
+			}
+			var imgErr error
+			image, imgErr = gemma3.ProcessImage(*imagePath, mm.ImageSize())
+			if imgErr != nil {
+				log.Fatalf("load image: %v", imgErr)
+			}
+		}
+
+		err = generate(context.Background(), m, input{
+			Prompt:       *prompt,
+			Image:        image,
+			MaxTokens:    *maxTokens,
+			Temperature:  float32(*temperature),
+			TopP:         float32(*topP),
+			TopK:         *topK,
+			WiredLimitGB: *wiredLimitGB,
+		}, func(out output) {
+			if out.Text != "" {
+				fmt.Print(out.Text)
+			}
+			if out.Done {
+				fmt.Printf("\n\n[prefill: %.1f tok/s, gen: %.1f tok/s]\n", out.PrefillTokSec, out.GenTokSec)
+			}
+		})
+	}
+
+	if err != nil {
+		// log.Fatal exits without running deferred calls, so flush any active
+		// CPU profile first. StopCPUProfile does nothing when none is running.
+		pprof.StopCPUProfile()
+		log.Fatal(err)
+	}
+}
+
+// listModelTensors loads the weights under modelPath and prints one line per
+// tensor with its name, shape, and dtype.
+func listModelTensors(modelPath string) error {
+	w, loadErr := safetensors.LoadModelWeights(modelPath)
+	if loadErr != nil {
+		return loadErr
+	}
+	for _, tensorName := range w.ListTensors() {
+		// The second result of the lookup is deliberately discarded here.
+		meta, _ := w.GetTensorInfo(tensorName)
+		fmt.Printf("%s: %v (%s)\n", tensorName, meta.Shape, meta.Dtype)
+	}
+	return nil
+}
+
+// loadModel constructs a model with build, runs cleanup, and only then
+// evaluates the model's collected weights.
+// cleanup is where the caller releases safetensors: the lazy arrays have
+// already captured their data, and releasing before eval reduces peak memory
+// by ~6GB (matches mlx-lm behavior).
+func loadModel[T Model](build func() T, cleanup func()) T {
+	model := build()
+	params := mlx.Collect(model)
+	cleanup()
+	mlx.Eval(params...)
+	return model
+}
+
+// load detects the model kind under modelPath and loads it with the matching
+// loader. Kinds without a dedicated loader fall back to the llama loader.
+func load(modelPath string) (Model, error) {
+	kind, err := detectModelKind(modelPath)
+	if err != nil {
+		return nil, fmt.Errorf("detect model kind: %w", err)
+	}
+
+	if kind == "gpt_oss" {
+		return gpt_oss.Load(modelPath)
+	}
+	if kind == "gemma3" {
+		return gemma3.Load(modelPath)
+	}
+	if kind == "gemma3_text" {
+		return gemma3.LoadText(modelPath)
+	}
+	return llama.Load(modelPath)
+}
+
+// detectModelKind identifies the model stored in the directory modelPath.
+//
+// A directory containing model_index.json is reported as "zimage", whatever
+// the file contains. Otherwise the "model_type" field of config.json is
+// returned. An error is returned when config.json is missing or cannot be
+// parsed.
+func detectModelKind(modelPath string) (string, error) {
+	// Only the presence of model_index.json matters: every outcome of reading
+	// or parsing it maps to the same kind, so the file is not opened.
+	if _, err := os.Stat(filepath.Join(modelPath, "model_index.json")); err == nil {
+		return "zimage", nil
+	}
+
+	data, err := os.ReadFile(filepath.Join(modelPath, "config.json"))
+	if err != nil {
+		return "", fmt.Errorf("no config.json or model_index.json found: %w", err)
+	}
+
+	var cfg struct {
+		ModelType string `json:"model_type"`
+	}
+	if err := json.Unmarshal(data, &cfg); err != nil {
+		return "", fmt.Errorf("parse config.json: %w", err)
+	}
+
+	return cfg.ModelType, nil
+}
--- a/x/imagegen/cmd/engine/sample.go
+++ b/x/imagegen/cmd/engine/sample.go
+//go:build mlx
+
+package main
+
+import "github.com/ollama/ollama/x/imagegen/mlx"
+
+// sampleTopK draws one token id from the k highest-scoring entries of
+// scaledLogits, using the global MLX random state.
+//
+// The logits are negated before partitioning so that, presumably, the k
+// largest logits land in the first k positions of the partition result.
+// The returned array holds the sampled index into the original logits.
+func sampleTopK(scaledLogits *mlx.Array, k int) *mlx.Array {
+	order := mlx.Argpartition(mlx.Neg(scaledLogits), k-1, -1)
+	candidates := mlx.Slice(order, []int32{0}, []int32{int32(k)})
+	candidateLogits := mlx.TakeAlongAxis(scaledLogits, candidates, -1)
+
+	// pick indexes into candidates; map it back to a vocabulary index.
+	pick := mlx.RandomCategorical(candidateLogits, -1, 1)
+	return mlx.Take(candidates, pick, -1)
+}
+
+// sampleTopP samples using nucleus (top-p) sampling with global random state.
+//
+// Tokens are ranked by descending logit and a token is kept when the
+// probability mass of all tokens ranked before it is still below p. The
+// nucleus therefore includes the token that crosses the threshold, and the
+// top-ranked token is always kept. Masking on the inclusive cumulative sum
+// instead would drop the crossing token, and would mask every entry
+// whenever the top token alone has probability >= p, which turns the draw
+// into a uniform sample over the whole vocabulary.
+//
+// scaledLogits is expected to be a 1-D array of vocabSize temperature-scaled
+// logits. The returned array holds the sampled vocabulary index.
+func sampleTopP(scaledLogits *mlx.Array, p float32, vocabSize int32) *mlx.Array {
+	sorted := mlx.Argsort(mlx.Neg(scaledLogits), -1)
+	sortedLogits := mlx.TakeAlongAxis(scaledLogits, sorted, -1)
+	probs := mlx.Softmax(sortedLogits, -1)
+
+	// Exclusive cumulative sum: mass of the tokens ranked strictly before
+	// each position (0 for the top-ranked token).
+	massBefore := mlx.Add(mlx.Cumsum(probs, -1), mlx.Neg(probs))
+	mask := mlx.LessScalar(massBefore, p)
+
+	// Tokens outside the nucleus get a very negative logit so they are
+	// effectively never drawn.
+	negInf := mlx.FullDtype(float32(-1e9), scaledLogits.Dtype(), vocabSize)
+	masked := mlx.Where(mask, sortedLogits, negInf)
+	sampled := mlx.RandomCategorical(masked, -1, 1)
+	return mlx.Take(sorted, sampled, -1)
+}
+
+// sample picks the next token from the logits of the last sequence position.
+//
+// logits is indexed as [1, L, vocab]; only position L-1 is used. With
+// temp == 0 the arg-max token is returned. Otherwise the logits are divided
+// by temp and sampled with top-k when 0 < topK < vocab, else with top-p when
+// 0 < topP < 1, else from the full distribution. Top-k takes precedence over
+// top-p when both are set.
+func sample(logits *mlx.Array, temp float32, topK int, topP float32, vocab int32) *mlx.Array {
+	dims := logits.Shape()
+	lastPos := dims[1] - 1
+
+	// Cut out the final position and flatten it: [1, L, vocab] -> [vocab]
+	final := mlx.Slice(logits, []int32{0, lastPos, 0}, []int32{1, lastPos + 1, vocab})
+	final = mlx.Reshape(final, vocab)
+
+	if temp == 0 {
+		return mlx.Argmax(final, -1, false)
+	}
+
+	scaled := mlx.DivScalar(final, temp)
+	switch {
+	case topK > 0 && topK < int(vocab):
+		return sampleTopK(scaled, topK)
+	case topP > 0 && topP < 1.0:
+		return sampleTopP(scaled, topP, vocab)
+	default:
+		return mlx.RandomCategorical(scaled, -1, 1)
+	}
+}
--- a/x/imagegen/mlx/README.md
+++ b/x/imagegen/mlx/README.md
+# MLX Memory Management
+
+> This package will be consolidated with `x/ml/backend/mlx` in the future.
+
+## Automatic Tracking
+
+All arrays are automatically tracked when created. On `Eval()`, non-kept arrays are freed.
+
+### API
+
+```go
+result := mlx.Matmul(x, w) // arrays automatically tracked
+mlx.Eval(result)           // free non-kept, eval result (auto-kept)
+```
+
+### Key Functions
+
+- `mlx.Eval(outputs...)` - free non-kept arrays, then evaluate (outputs auto-kept)
+- `mlx.AsyncEval(outputs...)` - async version of Eval (outputs auto-kept)
+- `mlx.Keep(arrays...)` - mark arrays to survive cleanup (for weights, caches)
+- `array.Free()` - mark array for cleanup on next Eval
+
+### Loop Pattern
+
+```go
+for step := 0; step < maxTokens; step++ {
+    logits := model.Forward(token, caches)
+    oldToken := token
+    token = sample(logits)
+
+    // Keep cache state across iterations
+    for _, c := range caches {
+        mlx.Keep(c.State()...)
+    }
+
+    oldToken.Free()       // mark for cleanup
+    mlx.AsyncEval(token)  // frees old, evals new
+}
+```
+
+### Notes
+
+- `Eval()` and `AsyncEval()` auto-keep their outputs
+- `Free()` marks for cleanup - actual free happens during next Eval
+- Use `Keep()` for weights and cache state that must survive multiple Eval cycles
+- Arrays created inside compiled closures are managed by MLX, not tracked
--- a/x/imagegen/mlx/compile.go
+++ b/x/imagegen/mlx/compile.go
+//go:build mlx
+
+package mlx
+
+/*
+#include "mlx/c/mlx.h"
+#include <stdlib.h>
+
+// Forward declaration for Go callback
+extern int goClosureCallback(mlx_vector_array* res, mlx_vector_array input, void* payload);
+
+// Destructor for payload (Go handle)
+extern void goClosureDestructor(void* payload);
+*/
+import "C"
+import (
+	"runtime/cgo"
+	"sync"
+	"unsafe"
+)
+
+var (
+	// closureCallbackMu guards inClosureCallback.
+	closureCallbackMu sync.Mutex
+	// inClosureCallback is true while goClosureCallback is executing.
+	inClosureCallback bool
+)
+
+// InClosureCallback reports whether a closure callback is currently running.
+func InClosureCallback() bool {
+	closureCallbackMu.Lock()
+	active := inClosureCallback
+	closureCallbackMu.Unlock()
+	return active
+}
+
+// CompiledFunc is a compiled MLX function that can be called efficiently.
+// All intermediate arrays during execution stay inside MLX - only inputs
+// and outputs cross the Go boundary.
+//
+// Values are created by Compile or CompileShapeless and must be released
+// with Free.
+type CompiledFunc struct {
+	// closure wraps the Go callback and its handle payload.
+	closure  C.mlx_closure
+	// compiled is the result of compiling closure; Call applies this one.
+	compiled C.mlx_closure
+}
+
+// ClosureFunc is the signature for functions that can be compiled.
+// It takes a slice of input arrays and returns a slice of output arrays.
+// The input arrays handed to it during a callback are borrowed wrappers
+// created without cleanup tracking.
+type ClosureFunc func(inputs []*Array) []*Array
+
+// Compile compiles a Go function into an optimized MLX closure.
+// The function is traced once during compilation, then subsequent calls
+// run the optimized graph without creating Go intermediate arrays.
+// It is shorthand for CompileShapeless(fn, false).
+//
+// Example:
+//
+//	compiled := mlx.Compile(func(inputs []*mlx.Array) []*mlx.Array {
+//	    a, b := inputs[0], inputs[1]
+//	    c := mlx.Add(a, b)
+//	    d := mlx.Mul(c, c)
+//	    return []*mlx.Array{d}
+//	})
+//	defer compiled.Free()
+//
+//	result := compiled.Call(x, y)[0]
+func Compile(fn ClosureFunc) *CompiledFunc {
+	return CompileShapeless(fn, false)
+}
+
+// CompileShapeless compiles with optional shapeless mode.
+// If shapeless=true, the function works for any input shape after tracing.
+//
+// fn is stored behind a cgo.Handle that is passed to MLX as the closure
+// payload; goClosureDestructor is registered to delete that handle.
+// The returned CompiledFunc owns both the raw and the compiled closure and
+// must be released with Free.
+//
+// NOTE(review): any status returned by mlx_compile is not inspected here,
+// so a failed compilation is not reported to the caller.
+func CompileShapeless(fn ClosureFunc, shapeless bool) *CompiledFunc {
+	// Create a cgo.Handle to prevent the Go function from being GC'd
+	handle := cgo.NewHandle(fn)
+
+	// Create the closure from the Go callback
+	closure := C.mlx_closure_new_func_payload(
+		(*[0]byte)(C.goClosureCallback),
+		unsafe.Pointer(handle), // the handle value itself travels as the opaque payload
+		(*[0]byte)(C.goClosureDestructor),
+	)
+
+	// Compile the closure
+	compiled := C.mlx_closure_new()
+	C.mlx_compile(&compiled, closure, C.bool(shapeless))
+
+	return &CompiledFunc{
+		closure:  closure,
+		compiled: compiled,
+	}
+}
+
+// Call applies the compiled closure to inputs and returns its outputs.
+// Each output is wrapped with newArray; the temporary C vectors used to pass
+// arguments and collect results are freed before returning.
+func (cf *CompiledFunc) Call(inputs ...*Array) []*Array {
+	// Marshal the Go inputs into an MLX vector.
+	args := C.mlx_vector_array_new()
+	for i := range inputs {
+		C.mlx_vector_array_append_value(args, inputs[i].c)
+	}
+
+	// Run the compiled graph; the argument vector is no longer needed afterwards.
+	results := C.mlx_vector_array_new()
+	C.mlx_closure_apply(&results, cf.compiled, args)
+	C.mlx_vector_array_free(args)
+
+	// Wrap every result in a Go Array.
+	count := int(C.mlx_vector_array_size(results))
+	outputs := make([]*Array, 0, count)
+	for i := 0; i < count; i++ {
+		var elem C.mlx_array
+		C.mlx_vector_array_get(&elem, results, C.size_t(i))
+		outputs = append(outputs, newArray(elem))
+	}
+	C.mlx_vector_array_free(results)
+
+	return outputs
+}
+
+// CallEval runs the compiled function via Call and then passes all of its
+// outputs to Eval before returning them.
+func (cf *CompiledFunc) CallEval(inputs ...*Array) []*Array {
+	results := cf.Call(inputs...)
+	Eval(results...)
+	return results
+}
+
+// Free releases the compiled function resources.
+// Both the compiled closure and the original closure are freed.
+// NOTE(review): there is no guard against calling Free twice or using the
+// CompiledFunc afterwards; callers should treat it as single-use teardown.
+func (cf *CompiledFunc) Free() {
+	C.mlx_closure_free(cf.compiled)
+	C.mlx_closure_free(cf.closure)
+}
+
+// borrowArray wraps a C array WITHOUT setting up GC cleanup.
+// Use this for arrays we don't own (e.g., borrowed references in callbacks).
+// Unlike newArray, it only stores the C handle in a fresh Array value.
+func borrowArray(array C.mlx_array) *Array {
+	return &Array{c: array}
+}
+
+// goClosureCallback is the C-callable trampoline registered with MLX for
+// closures created by CompileShapeless. payload carries the cgo.Handle of
+// the ClosureFunc; input holds the arguments supplied by MLX, and the
+// function's outputs are written to *res as a new vector. It always
+// returns 0.
+//
+// While the callback runs, the package-level inClosureCallback flag is set.
+// The flag is shared by the whole process rather than being per-call.
+// A panic raised by the Go function is not recovered here.
+//
+//export goClosureCallback
+func goClosureCallback(res *C.mlx_vector_array, input C.mlx_vector_array, payload unsafe.Pointer) C.int {
+	// Set flag to disable AddCleanup during callback
+	closureCallbackMu.Lock()
+	inClosureCallback = true
+	closureCallbackMu.Unlock()
+	defer func() {
+		// Clear the flag on every exit path.
+		closureCallbackMu.Lock()
+		inClosureCallback = false
+		closureCallbackMu.Unlock()
+	}()
+
+	// Recover the Go function from the handle
+	handle := cgo.Handle(payload)
+	fn := handle.Value().(ClosureFunc)
+
+	// Convert input vector to Go slice - use borrowArray since MLX owns these
+	numInputs := int(C.mlx_vector_array_size(input))
+	inputs := make([]*Array, numInputs)
+	for i := 0; i < numInputs; i++ {
+		var arr C.mlx_array
+		C.mlx_vector_array_get(&arr, input, C.size_t(i))
+		inputs[i] = borrowArray(arr) // Don't set up cleanup - MLX owns these
+	}
+
+	// Call the Go function
+	outputs := fn(inputs)
+
+	// Build output vector
+	*res = C.mlx_vector_array_new()
+	for _, arr := range outputs {
+		C.mlx_vector_array_append_value(*res, arr.c)
+	}
+
+	return 0
+}
+
+// goClosureDestructor is the payload destructor registered alongside
+// goClosureCallback. It deletes the cgo.Handle carried in payload so the
+// Go function it refers to can be garbage collected.
+//
+//export goClosureDestructor
+func goClosureDestructor(payload unsafe.Pointer) {
+	cgo.Handle(payload).Delete()
+}
--- a/x/imagegen/mlx/mlx.go
+++ b/x/imagegen/mlx/mlx.go
--- a/x/imagegen/mlx/mlx_test.go
+++ b/x/imagegen/mlx/mlx_test.go
--- a/x/imagegen/models/gemma3/gemma3.go
+++ b/x/imagegen/models/gemma3/gemma3.go
--- a/x/imagegen/models/gemma3/image.go
+++ b/x/imagegen/models/gemma3/image.go
+//go:build mlx
+
+package gemma3
+
+import (
+	"fmt"
+	"image"
+	_ "image/jpeg"
+	_ "image/png"
+	"os"
+
+	"github.com/ollama/ollama/x/imagegen/mlx"
+	"golang.org/x/image/draw"
+)
+
+// ProcessImage reads the image file at path, decodes it (JPEG and PNG
+// decoders are registered by this file's imports) and hands it to
+// ProcessImageData.
+// Returns a [1, H, W, C] tensor in NHWC format normalized for SigLIP, or an
+// error if the file cannot be opened or decoded.
+func ProcessImage(path string, imageSize int32) (*mlx.Array, error) {
+	file, openErr := os.Open(path)
+	if openErr != nil {
+		return nil, fmt.Errorf("open image: %w", openErr)
+	}
+	defer file.Close()
+
+	decoded, _, decodeErr := image.Decode(file)
+	if decodeErr != nil {
+		return nil, fmt.Errorf("decode image: %w", decodeErr)
+	}
+
+	return ProcessImageData(decoded, imageSize)
+}
+
+// ProcessImageData preprocesses an image.Image for the vision tower.
+//
+// The image is resized to imageSize x imageSize with bilinear interpolation,
+// the alpha channel is dropped, and every channel is mapped from [0, 255] to
+// [-1, 1] (SigLIP normalization: (pixel / 255.0 - 0.5) / 0.5 = pixel / 127.5 - 1.0).
+//
+// Returns an evaluated [1, H, W, C] float32 array in NHWC layout, or an error
+// when imageSize is not positive.
+func ProcessImageData(img image.Image, imageSize int32) (*mlx.Array, error) {
+	// A zero or negative size would hand MLX an empty buffer or a negative
+	// shape; reject it up front instead.
+	if imageSize <= 0 {
+		return nil, fmt.Errorf("process image: invalid image size %d", imageSize)
+	}
+	side := int(imageSize)
+
+	// Resize to target size using bilinear interpolation
+	resized := image.NewRGBA(image.Rect(0, 0, side, side))
+	draw.BiLinear.Scale(resized, resized.Bounds(), img, img.Bounds(), draw.Over, nil)
+
+	// Read the 8-bit samples straight from the pixel buffer. The buffer was
+	// just allocated at origin (0, 0), so it is tightly packed as R, G, B, A
+	// per pixel in row-major order. This yields the same values as
+	// At(x, y).RGBA() >> 8 without a per-pixel interface call.
+	pixels := side * side
+	data := make([]float32, pixels*3)
+	pix := resized.Pix
+	for i := 0; i < pixels; i++ {
+		src, dst := i*4, i*3
+		data[dst] = float32(pix[src])/127.5 - 1.0
+		data[dst+1] = float32(pix[src+1])/127.5 - 1.0
+		data[dst+2] = float32(pix[src+2])/127.5 - 1.0
+	}
+
+	// Create MLX array [1, H, W, C] for NHWC layout
+	arr := mlx.NewArrayFloat32(data, []int32{1, imageSize, imageSize, 3})
+	mlx.Eval(arr) // Materialize to prevent use-after-free
+	return arr, nil
+}
--- a/x/imagegen/models/gemma3/projector.go
+++ b/x/imagegen/models/gemma3/projector.go
+//go:build mlx
+
+package gemma3
+
+import (
+	"github.com/ollama/ollama/x/imagegen/mlx"
+	"github.com/ollama/ollama/x/imagegen/nn"
+)
+
+// MultiModalProjector projects vision features to text embedding space.
+// Fields tagged with `weight` name the tensor they are populated from;
+// a tag of "-" marks a field that is not loaded from the weights.
+type MultiModalProjector struct {
+	// mm_input_projection_weight: [vision_hidden, text_hidden]
+	InputProjection *mlx.Array  `weight:"mm_input_projection_weight"`
+	// SoftEmbNorm holds the raw norm weights; Forward uses SoftEmbNormScaled instead.
+	SoftEmbNorm     *nn.RMSNorm `weight:"mm_soft_emb_norm"`
+
+	// Precomputed (1 + weight) for Gemma-style RMSNorm
+	SoftEmbNormScaled *mlx.Array `weight:"-"`
+}
+
+// Forward projects vision features to text space
+// Input: [B, num_patches, vision_hidden] (e.g., [1, 4096, 1152])
+// Output: [B, num_image_tokens, text_hidden] (e.g., [1, 256, 2560])
+func (p *MultiModalProjector) Forward(visionFeatures *mlx.Array, eps float32) *mlx.Array {
+	// Average pool 4x4: [B, 4096, 1152] -> [B, 256, 1152]
+	// 4096 patches = 64x64 grid, pool to 16x16 = 256 tokens
+	B := visionFeatures.Shape()[0]
+	visionHidden := visionFeatures.Shape()[2]
+
+	// Reshape to [B, 64, 64, hidden]
+	gridSize := int32(64) // sqrt(4096)
+	pooledSize := int32(16) // 64/4
+	h := mlx.Reshape(visionFeatures, B, gridSize, gridSize, visionHidden)
+
+	// Reshape to [B, 16, 4, 16, 4, hidden] for 4x4 pooling
+	h = mlx.Reshape(h, B, pooledSize, 4, pooledSize, 4, visionHidden)
+
+	// Average over pooling dimensions (axes 2 and 4)
+	h = mlx.Mean(h, 4, false)
+	h = mlx.Mean(h, 2, false)
+
+	// h is now [B, 16, 16, hidden], reshape to [B, 256, hidden]
+	numTokens := pooledSize * pooledSize
+	h = mlx.Reshape(h, B, numTokens, visionHidden)
+
+	// Apply Gemma-style RMS norm (use precomputed 1 + weight)
+	h = mlx.RMSNorm(h, p.SoftEmbNormScaled, eps)
+
+	// Project to text space: [B, 256, vision_hidden] @ [vision_hidden, text_hidden]
+	return mlx.Linear(h, p.InputProjection)
+}
--- a/x/imagegen/models/gemma3/vision.go
+++ b/x/imagegen/models/gemma3/vision.go
+//go:build mlx
+
+package gemma3
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/x/imagegen/mlx"
+	"github.com/ollama/ollama/x/imagegen/nn"
+)
+
+// VisionConfig holds configuration for the SigLIP vision tower.
+// Fields are decoded from JSON using the keys in the struct tags.
+type VisionConfig struct {
+	// HiddenSize is the width of the encoder activations; attention output
+	// is reshaped back to [B, L, HiddenSize].
+	HiddenSize        int32 `json:"hidden_size"`
+	// ImageSize is presumably the expected input side length in pixels (not read in this file).
+	ImageSize         int32 `json:"image_size"`
+	// IntermediateSize is presumably the MLP inner width (not read in this file).
+	IntermediateSize  int32 `json:"intermediate_size"`
+	// NumAttentionHeads divides HiddenSize to give the per-head dimension.
+	NumAttentionHeads int32 `json:"num_attention_heads"`
+	// NumHiddenLayers is presumably the encoder depth (not read in this file).
+	NumHiddenLayers   int32 `json:"num_hidden_layers"`
+	// PatchSize is passed to the patch-embedding convolution as its stride.
+	PatchSize         int32 `json:"patch_size"`
+}
+
+// VisionTower is the SigLIP vision encoder.
+// Forward applies Embeddings, then every layer of Encoder in order, then
+// PostLayerNorm.
+type VisionTower struct {
+	Embeddings    *VisionEmbeddings     `weight:"vision_model.embeddings"`
+	Encoder       []*VisionEncoderLayer `weight:"vision_model.encoder.layers"`
+	PostLayerNorm *nn.LayerNorm         `weight:"vision_model.post_layernorm"`
+	// Config carries no weight tag; Forward reads PatchSize, HiddenSize and
+	// NumAttentionHeads from it, so it must be set before Forward is called.
+	Config        *VisionConfig
+}
+
+// VisionEmbeddings handles patch and position embeddings.
+type VisionEmbeddings struct {
+	// PatchWeight: [O, C, kH, kW] from PyTorch, transposed to [O, kH, kW, C] for MLX
+	// (the transpose is applied in VisionTower.Forward, not at load time).
+	PatchWeight *mlx.Array    `weight:"patch_embedding.weight"`
+	// PatchBias: [O], broadcast-added to the convolution output.
+	PatchBias   *mlx.Array    `weight:"patch_embedding.bias"`
+	// PosEmbed is looked up with patch indices 0..num_patches-1.
+	PosEmbed    *nn.Embedding `weight:"position_embedding"`
+}
+
+// VisionEncoderLayer is a single transformer encoder layer.
+// Its Forward applies LayerNorm1 -> Attention and LayerNorm2 -> MLP, each
+// wrapped in a residual connection.
+type VisionEncoderLayer struct {
+	LayerNorm1 *nn.LayerNorm     `weight:"layer_norm1"`
+	Attention  *VisionAttention  `weight:"self_attn"`
+	LayerNorm2 *nn.LayerNorm     `weight:"layer_norm2"`
+	MLP        *VisionMLP        `weight:"mlp"`
+}
+
+// VisionAttention implements multi-head self-attention.
+// QProj, KProj and VProj produce the queries, keys and values from the same
+// input; OutProj is applied to the merged attention output.
+type VisionAttention struct {
+	QProj   *nn.Linear `weight:"q_proj"`
+	KProj   *nn.Linear `weight:"k_proj"`
+	VProj   *nn.Linear `weight:"v_proj"`
+	OutProj *nn.Linear `weight:"out_proj"`
+}
+
+// VisionMLP is the feed-forward network.
+// Forward computes FC2(GELU(FC1(x))).
+type VisionMLP struct {
+	FC1 *nn.Linear `weight:"fc1"`
+	FC2 *nn.Linear `weight:"fc2"`
+}
+
+// Forward encodes preprocessed images with the vision tower.
+// Input: [B, H, W, C] normalized image tensor (NHWC layout for MLX)
+// Output: [B, num_patches, hidden_size]
+//
+// Steps: patch-embedding convolution with bias, flatten the patch grid, add
+// position embeddings, run every encoder layer, apply the final layer norm.
+func (v *VisionTower) Forward(x *mlx.Array) *mlx.Array {
+	emb := v.Embeddings
+
+	// The conv kernel is stored PyTorch-style as [O, C, kH, kW]; MLX wants [O, kH, kW, C].
+	kernel := mlx.Transpose(emb.PatchWeight, 0, 2, 3, 1)
+	// stride = patch_size, no padding: [B, H, W, C] -> [B, grid, grid, O]
+	patches := mlx.Conv2d(x, kernel, v.Config.PatchSize, 0)
+
+	// Broadcast the bias over batch and both grid axes: [O] -> [1, 1, 1, O]
+	outChannels := emb.PatchBias.Shape()[0]
+	patches = mlx.Add(patches, mlx.Reshape(emb.PatchBias, 1, 1, 1, outChannels))
+
+	// Flatten the grid: [B, grid, grid, hidden] -> [B, num_patches, hidden]
+	dims := patches.Shape()
+	batch, numPatches, width := dims[0], dims[1]*dims[2], dims[3]
+	h := mlx.Reshape(patches, batch, numPatches, width)
+
+	// One learned position embedding per patch index
+	positions := mlx.ArangeInt(0, numPatches, 1, mlx.DtypeInt32)
+	h = mlx.Add(h, emb.PosEmbed.Forward(positions))
+
+	// Attention scale 1/sqrt(head_dim) is shared by every layer
+	perHead := float32(v.Config.HiddenSize / v.Config.NumAttentionHeads)
+	scale := float32(1.0 / math.Sqrt(float64(perHead)))
+	for i := range v.Encoder {
+		h = v.Encoder[i].Forward(h, v.Config, scale)
+	}
+
+	return v.PostLayerNorm.Forward(h)
+}
+
+// Forward applies one pre-norm transformer encoder layer:
+// x + Attention(LayerNorm1(x)), followed by the same residual pattern
+// around MLP(LayerNorm2(.)).
+func (l *VisionEncoderLayer) Forward(x *mlx.Array, cfg *VisionConfig, scale float32) *mlx.Array {
+	// Attention sub-block with residual
+	attnOut := l.Attention.Forward(l.LayerNorm1.Forward(x), cfg, scale)
+	residual := mlx.Add(x, attnOut)
+
+	// Feed-forward sub-block with residual
+	mlpOut := l.MLP.Forward(l.LayerNorm2.Forward(residual))
+	return mlx.Add(residual, mlpOut)
+}
+
+// Forward computes multi-head self-attention over x ([B, L, hidden]) and
+// returns the output projection of the merged heads ([B, L, hidden]).
+// No causal mask is applied.
+func (a *VisionAttention) Forward(x *mlx.Array, cfg *VisionConfig, scale float32) *mlx.Array {
+	dims := x.Shape()
+	batch, seq := dims[0], dims[1]
+	heads := cfg.NumAttentionHeads
+	perHead := cfg.HiddenSize / heads
+
+	// splitHeads: [B, L, hidden] -> [B, num_heads, L, head_dim]
+	splitHeads := func(t *mlx.Array) *mlx.Array {
+		return mlx.Transpose(mlx.Reshape(t, batch, seq, heads, perHead), 0, 2, 1, 3)
+	}
+
+	queries := a.QProj.Forward(x)
+	keys := a.KProj.Forward(x)
+	values := a.VProj.Forward(x)
+
+	queries = splitHeads(queries)
+	keys = splitHeads(keys)
+	values = splitHeads(values)
+
+	// Scaled dot-product attention (no causal mask for vision)
+	attended := mlx.ScaledDotProductAttention(queries, keys, values, scale, false)
+
+	// Merge heads: [B, num_heads, L, head_dim] -> [B, L, hidden]
+	merged := mlx.Reshape(mlx.Transpose(attended, 0, 2, 1, 3), batch, seq, cfg.HiddenSize)
+
+	return a.OutProj.Forward(merged)
+}
+
+// Forward applies the feed-forward network: FC1, then GELU, then FC2.
+func (m *VisionMLP) Forward(x *mlx.Array) *mlx.Array {
+	hidden := m.FC1.Forward(x)
+	hidden = mlx.GELU(hidden)
+	return m.FC2.Forward(hidden)
+}
--- a/x/imagegen/models/gpt_oss/gpt_oss.go
+++ b/x/imagegen/models/gpt_oss/gpt_oss.go
--- a/x/imagegen/models/llama/llama.go
+++ b/x/imagegen/models/llama/llama.go
--- a/x/imagegen/models/qwen_image/pipeline_test.go
+++ b/x/imagegen/models/qwen_image/pipeline_test.go
+//go:build mlx
+
+package qwen_image
+
+import (
+	"os"
+	"testing"
+
+	"github.com/ollama/ollama/x/imagegen/mlx"
+)
+
+// TestPipelineOutput runs the full pipeline (integration test).
+// Skips if model weights not found. Requires ~50GB VRAM.
+//
+// It generates a 256x256 image in two steps and checks that the output is a
+// [1, 3, H, W] tensor whose values lie in [0, 1], with a 0.1 tolerance on
+// either side, and contain no NaN.
+func TestPipelineOutput(t *testing.T) {
+	modelPath := "../../../weights/Qwen-Image-2512"
+	if _, err := os.Stat(modelPath); os.IsNotExist(err) {
+		t.Skip("Skipping: model weights not found at " + modelPath)
+	}
+
+	// Load model
+	pm, err := LoadPersistent(modelPath)
+	if err != nil {
+		t.Skipf("Skipping: failed to load model: %v", err)
+	}
+
+	// Run 2-step pipeline (minimum for stable scheduler)
+	cfg := &GenerateConfig{
+		Prompt: "a cat",
+		Width:  256,
+		Height: 256,
+		Steps:  2,
+		Seed:   42,
+	}
+
+	output, err := pm.GenerateFromConfig(cfg)
+	if err != nil {
+		t.Fatalf("Pipeline failed: %v", err)
+	}
+	mlx.Eval(output)
+
+	// Verify output shape [1, C, H, W]. The rank check must be fatal:
+	// the element checks below index shape[0..3] and would panic otherwise.
+	shape := output.Shape()
+	if len(shape) != 4 {
+		t.Fatalf("Expected 4D output, got %v", shape)
+	}
+	if shape[0] != 1 || shape[1] != 3 || shape[2] != cfg.Height || shape[3] != cfg.Width {
+		t.Errorf("Shape mismatch: got %v, expected [1, 3, %d, %d]", shape, cfg.Height, cfg.Width)
+	}
+
+	// Verify values in expected range [0, 1] (with tolerance)
+	data := output.Data()
+	if len(data) == 0 {
+		t.Fatal("Output contains no data")
+	}
+	// Seed min/max from the data itself so the logged range is the true range.
+	minVal, maxVal := data[0], data[0]
+	for i, v := range data {
+		if v != v {
+			// NaN compares false against every bound and would otherwise slip through.
+			t.Fatalf("Output contains NaN at index %d", i)
+		}
+		if v < minVal {
+			minVal = v
+		}
+		if v > maxVal {
+			maxVal = v
+		}
+	}
+	t.Logf("Output range: [%.4f, %.4f]", minVal, maxVal)
+
+	if minVal < -0.1 || maxVal > 1.1 {
+		t.Errorf("Output values out of range: [%.4f, %.4f]", minVal, maxVal)
+	}
+}
--- a/x/imagegen/models/qwen_image/qwen25vl.go
+++ b/x/imagegen/models/qwen_image/qwen25vl.go
--- a/x/imagegen/models/qwen_image/qwen_image.go
+++ b/x/imagegen/models/qwen_image/qwen_image.go
--- a/x/imagegen/models/qwen_image/scheduler.go
+++ b/x/imagegen/models/qwen_image/scheduler.go