gpt-oss (#11672)

* bf16 * tests * gpt-oss * enable gptoss for engine * rough estimate * convert to mxfp4 * handle safetensors U8 * clamp glu/linear * update tokenizer * MXFP4 support This implements the Open Compute Microscaling (MX) FP4 format as a tensor type with backend implementations focusing on mulmat and mulmatid on CPU, CUDA, and Metal. * Unit tests for MXFP4 support This exercises various operations and shapes on both CPU and GPU (if detected on the system) * cuda graph * unit test adjustments * cuda: optimize memory access Read 4 bytes at a time (8 elements) when performing mul_mat_vec_mxfp4 * mac: fix crash on old macos versions cblas_sgemm is only supported on v13.3 and up, however bf16 is only supported on v14+ so we were falling back to ggml-blas and crashing on bf16 tensors. Checking for the function being null seems to be the simplest way to conditionally avoid registering the backend. * server: Minimum context length for gptoss This model requires a minimum context length of 8192 to function effectively. Users can set higher values through all normal mechanisms but lower values will be silently reset. * ggml: Multiply by numParallel for gptoss sliding window When computing the graph size estimate, the context size is already multiplied by numParallel so estimates reflect that. However, since sliding window models use a smaller, fixed context size, they need to manually take numParallel into account. * gpt-oss integration includes harmony parser and thinking levels, etc. * fix sync * fix tests * fix lint --------- Co-authored-by: Daniel Hiltgen <daniel@ollama.com> Co-authored-by: Jesse Gross <jesse@ollama.com> Co-authored-by: Devon Rifkin <drifkin@drifkin.net>

gpt-oss (#11672)
* bf16 * tests * gpt-oss * enable gptoss for engine * rough estimate * convert to mxfp4 * handle safetensors U8 * clamp glu/linear * update tokenizer * MXFP4 support This implements the Open Compute Microscaling (MX) FP4 format as a tensor type with backend implementations focusing on mulmat and mulmatid on CPU, CUDA, and Metal. * Unit tests for MXFP4 support This exercises various operations and shapes on both CPU and GPU (if detected on the system) * cuda graph * unit test adjustments * cuda: optimize memory access Read 4 bytes at a time (8 elements) when performing mul_mat_vec_mxfp4 * mac: fix crash on old macos versions cblas_sgemm is only supported on v13.3 and up, however bf16 is only supported on v14+ so we were falling back to ggml-blas and crashing on bf16 tensors. Checking for the function being null seems to be the simplest way to conditionally avoid registering the backend. * server: Minimum context length for gptoss This model requires a minimum context length of 8192 to function effectively. Users can set higher values through all normal mechanisms but lower values will be silently reset. * ggml: Multiply by numParallel for gptoss sliding window When computing the graph size estimate, the context size is already multiplied by numParallel so estimates reflect that. However, since sliding window models use a smaller, fixed context size, they need to manually take numParallel into account. * gpt-oss integration includes harmony parser and thinking levels, etc. * fix sync * fix tests * fix lint --------- Co-authored-by: Daniel Hiltgen <daniel@ollama.com> Co-authored-by: Jesse Gross <jesse@ollama.com> Co-authored-by: Devon Rifkin <drifkin@drifkin.net>
fa7776fd · Michael Yang · GitHub · 0d38b665 · fa7776fd · fa7776fd
Unverified Commit fa7776fd authored Aug 05, 2025 by Michael Yang Committed by GitHub Aug 05, 2025
16 changed files
--- a/ml/nn/linear.go
+++ b/ml/nn/linear.go
@@ -15,3 +15,26 @@ func (m *Linear) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
 	return t
 }
+// LinearBatch is a linear layer whose Forward pass takes an additional
+// indices tensor. Weight and Bias carry the gguf tags "weight" and "bias".
+// Bias is optional: Forward skips the bias addition when it is nil.
+type LinearBatch struct {
+	Weight ml.Tensor `gguf:"weight"`
+	Bias   ml.Tensor `gguf:"bias"`
+}
+// Forward computes m.Weight.MulmatID(ctx, t, indices) and, when m.Bias is
+// set, adds the bias rows selected by the same indices to the result.
+//
+// NOTE(review): MulmatID presumably picks a weight matrix per index
+// (mixture-of-experts style routing) — confirm against the ml.Tensor API.
+func (m *LinearBatch) Forward(ctx ml.Context, t, indices ml.Tensor) ml.Tensor {
+	t = m.Weight.MulmatID(ctx, t, indices)
+	if m.Bias != nil {
+		var bias ml.Tensor
+		if len(indices.Shape()) > 1 {
+			// FIXME: Rows does not support 2D indices for a 2D input tensor so reshape indices to 1D.
+			// The indices are flattened to Dim(0)*Dim(1) entries for the row
+			// lookup, and the gathered rows are reshaped to
+			// (m.Bias.Dim(0), indices.Dim(0), indices.Dim(1)) before the add.
+			bias = m.Bias.Rows(ctx, indices.Contiguous(ctx, indices.Dim(0)*indices.Dim(1))).
+				Duplicate(ctx).
+				Reshape(ctx, m.Bias.Dim(0), indices.Dim(0), indices.Dim(1))
+		} else {
+			// Indices with a single dimension can be passed to Rows directly.
+			bias = m.Bias.Rows(ctx, indices)
+		}
+		t = t.Add(ctx, bias)
+	}
+	return t
+}
--- a/ml/nn/rope/rope.go
+++ b/ml/nn/rope/rope.go
@@ -4,9 +4,15 @@ import "github.com/ollama/ollama/ml"
 // Options contains optional parameters for RoPE function
 type Options struct {
-	OriginalContextLength int
 	Type                  int
 	Factors               ml.Tensor
+	OriginalContextLength int
+	// YaRN options
+	ExtrapolationFactor,
+	AttentionFactor,
+	BetaFast,
+	BetaSlow float32
 }
 // WithOriginalContextLength sets a custom context length
@@ -31,3 +37,15 @@ func WithFactors(factors ml.Tensor) func(*Options) {
 		}
 	}
 }
+// WithExtrapolationFactor returns an option that sets
+// Options.ExtrapolationFactor, one of the YaRN options, to the given value.
+func WithExtrapolationFactor(extrapolationFactor float32) func(*Options) {
+	return func(opts *Options) {
+		opts.ExtrapolationFactor = extrapolationFactor
+	}
+}
+// WithAttentionFactor returns an option that sets Options.AttentionFactor,
+// one of the YaRN options, to the given value.
+func WithAttentionFactor(attentionFactor float32) func(*Options) {
+	return func(opts *Options) {
+		opts.AttentionFactor = attentionFactor
+	}
+}
--- a/model/bytepairencoding.go
+++ b/model/bytepairencoding.go
@@ -22,7 +22,7 @@ var _ TextProcessor = (*BytePairEncoding)(nil)
 func NewBytePairEncoding(pre string, vocab *Vocabulary) BytePairEncoding {
 	return BytePairEncoding{
-		pre:   regexp2.MustCompile(pre, regexp2.Unicode|regexp2.RE2),
+		pre:   regexp2.MustCompile(pre, regexp2.None),
 		vocab: vocab,
 	}
 }

--- a/model/models/gptoss/model.go
+++ b/model/models/gptoss/model.go
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -4,6 +4,7 @@ import (
 	_ "github.com/ollama/ollama/model/models/gemma2"
 	_ "github.com/ollama/ollama/model/models/gemma3"
 	_ "github.com/ollama/ollama/model/models/gemma3n"
+	_ "github.com/ollama/ollama/model/models/gptoss"
 	_ "github.com/ollama/ollama/model/models/llama"
 	_ "github.com/ollama/ollama/model/models/llama4"
 	_ "github.com/ollama/ollama/model/models/mistral3"

--- a/openai/openai.go
+++ b/openai/openai.go
--- a/server/harmonyparser.go
+++ b/server/harmonyparser.go
--- a/server/harmonyparser_test.go
+++ b/server/harmonyparser_test.go
--- a/server/images.go
+++ b/server/images.go
@@ -111,7 +111,8 @@ func (m *Model) Capabilities() []model.Capability {
 	// Check for thinking capability
 	openingTag, closingTag := thinking.InferTags(m.Template.Template)
-	if openingTag != "" && closingTag != "" {
+	hasTags := openingTag != "" && closingTag != ""
+	if hasTags || m.Config.ModelFamily == "gptoss" {
 		capabilities = append(capabilities, model.CapabilityThinking)
 	}

--- a/server/prompt.go
+++ b/server/prompt.go
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@@ -209,7 +209,7 @@ func TestChatPrompt(t *testing.T) {
 			model := tt.model
 			opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}}
 			think := false
-			prompt, images, err := chatPrompt(t.Context(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil, &think)
+			prompt, images, err := chatPrompt(t.Context(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil, &api.ThinkValue{Value: think})
 			if tt.error == nil && err != nil {
 				t.Fatal(err)
 			} else if tt.error != nil && err != tt.error {

--- a/server/routes.go
+++ b/server/routes.go
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -150,7 +150,7 @@ func TestGenerateChat(t *testing.T) {
 			Messages: []api.Message{
 				{Role: "user", Content: "Hello!"},
 			},
-			Think: &think,
+			Think: &api.ThinkValue{Value: think},
 		})
 		if w.Code != http.StatusBadRequest {

--- a/server/routes_harmony_streaming_test.go
+++ b/server/routes_harmony_streaming_test.go
--- a/template/template.go
+++ b/template/template.go
--- a/tools/tools.go
+++ b/tools/tools.go
@@ -26,6 +26,10 @@ type Parser struct {
 	n      int
 }
+// GetBuffer returns the parser's internal buffer. The slice is returned
+// as-is rather than copied, so it shares its backing array with the parser.
+func (p *Parser) GetBuffer() []byte {
+	return p.buffer
+}
 // NewParser creates a new tool call parser from a model's chat
 // template and a list of provided tools.
 func NewParser(tmpl *template.Template, tools []api.Tool) *Parser {