Merge pull request #12417 from ollama/drifkin/qwen3-coder-unicode

parsers: fix unicode handling for qwen3-coder

Merge pull request #12417 from ollama/drifkin/qwen3-coder-unicode
parsers: fix unicode handling for qwen3-coder
34efbbd3 · Devon Rifkin · GitHub · 5a56ff3c · 05ba4ca1 · 34efbbd3
Unverified commit 34efbbd3, authored Sep 25, 2025 by Devon Rifkin; committed by GitHub on Sep 25, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 231 additions and 4 deletions

model/parsers/qwen3coder.go (+14 -4)

model/parsers/qwen3coder_test.go (+217 -0)

No files found.
--- a/model/parsers/qwen3coder.go
+++ b/model/parsers/qwen3coder.go
@@ -11,6 +11,7 @@ import (
 	"strconv"
 	"strings"
 	"unicode"
+	"unicode/utf8"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/logutil"
@@ -204,12 +205,21 @@ func overlap(s, delim string) int {
 }
 func trailingWhitespaceLen(s string) int {
-	for i := len(s) - 1; i >= 0; i-- {
+	remaining := s
-		if !unicode.IsSpace(rune(s[i])) {
+	total := 0
-			return len(s) - i - 1
+	for len(remaining) > 0 {
+		r, size := utf8.DecodeLastRuneInString(remaining)
+		// if it's an invalid utf8 rune, assume it isn't whitespace
+		if r == utf8.RuneError && size == 1 {
+			break
+		}
+		if !unicode.IsSpace(r) {
+			break
 		}
+		total += size
+		remaining = remaining[:len(remaining)-size]
 	}
-	return len(s)
+	return total
 }
 type XMLFunctionCall struct {

--- a/model/parsers/qwen3coder_test.go
+++ b/model/parsers/qwen3coder_test.go
@@ -166,6 +166,137 @@ func TestQwenParserStreaming(t *testing.T) {
 				},
 			},
 		},
+		{
+			desc: "unicode content",
+			steps: []step{
+				{
+					input: "你好 🌍<tool_call>test</tool_call>مرحبا",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "你好 🌍"},
+						qwenEventRawToolCall{raw: "test"},
+						qwenEventContent{content: "مرحبا"},
+					},
+				},
+			},
+		},
+		{
+			desc: "arabic text handling",
+			steps: []step{
+				{
+					input:      "مرحبا بالعالم",
+					wantEvents: []qwenEvent{qwenEventContent{content: "مرحبا بالعالم"}},
+				},
+			},
+		},
+		{
+			desc: "emoji passthrough",
+			steps: []step{
+				{
+					input:      "✅",
+					wantEvents: []qwenEvent{qwenEventContent{content: "✅"}},
+				},
+			},
+		},
+		{
+			desc: "emoji after tool call",
+			steps: []step{
+				{
+					input: "<tool_call>test</tool_call>完成 ✅",
+					wantEvents: []qwenEvent{
+						qwenEventRawToolCall{raw: "test"},
+						qwenEventContent{content: "完成 ✅"},
+					},
+				},
+			},
+		},
+		{
+			desc: "unicode streaming with whitespace handling",
+			steps: []step{
+				{
+					input: "مرحبا",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "مرحبا"},
+					},
+				},
+				{
+					input:      " \n",
+					wantEvents: []qwenEvent{},
+				},
+				{
+					input: "世界",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: " \n世界"},
+					},
+				},
+			},
+		},
+		{
+			desc: "non-breaking space withheld across chunks",
+			steps: []step{
+				{
+					input: "Hello\u00a0",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "Hello"},
+					},
+				},
+				{
+					input: "world",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "\u00a0world"},
+					},
+				},
+			},
+		},
+		{
+			desc: "ideographic space before partial tool",
+			steps: []step{
+				{
+					input: "Hello\u3000<tool",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "Hello"},
+					},
+				},
+				{
+					input:      "_call>abc",
+					wantEvents: []qwenEvent{},
+				},
+				{
+					input: "</tool_call>def",
+					wantEvents: []qwenEvent{
+						qwenEventRawToolCall{raw: "abc"},
+						qwenEventContent{content: "def"},
+					},
+				},
+			},
+		},
+		{
+			desc: "ideographic space before partial tool fakeout",
+			steps: []step{
+				{
+					input: "Hello\u3000<tool",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "Hello"},
+					},
+				},
+				{
+					input: "fakeout>abc",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "\u3000<toolfakeout>abc"},
+					},
+				},
+			},
+		},
+		{
+			desc: "unicode with partial tool tag",
+			steps: []step{
+				{
+					input: "测试🎯 <to",
+					wantEvents: []qwenEvent{
+						qwenEventContent{content: "测试🎯"},
+					},
+				},
+			},
+		},
 	}
 	anyOnlies := false
@@ -347,6 +478,27 @@ ls && echo "a > b and a < b"
 				},
 			},
 		},
+		{
+			name:  "unicode in function names and parameters",
+			tools: []api.Tool{},
+			rawToolCall: `<function=获取天气>
+<parameter=城市>
+北京
+</parameter>
+<parameter=message>
+Hello! 你好! 🌟 مرحبا
+</parameter>
+</function>`,
+			wantToolCall: api.ToolCall{
+				Function: api.ToolCallFunction{
+					Name: "获取天气",
+					Arguments: map[string]any{
+						"城市":      "北京",
+						"message": "Hello! 你好! 🌟 مرحبا",
+					},
+				},
+			},
+		},
 	}
 	for i, step := range steps {
@@ -360,6 +512,42 @@ ls && echo "a > b and a < b"
 	}
 }
+func TestTrailingWhitespaceLenUnicode(t *testing.T) {
+	cases := []struct {
+		name  string
+		input string
+		want  int
+	}{
+		{
+			name:  "ascii space",
+			input: "Hello ",
+			want:  1,
+		},
+		{
+			name:  "non-breaking space",
+			input: "Hello\u00a0",
+			want:  2,
+		},
+		{
+			name:  "ideographic space",
+			input: "Hello\u3000",
+			want:  3,
+		},
+		{
+			name:  "multiple runes of whitespace",
+			input: "Hi\u00a0\u3000",
+			want:  5,
+		},
+	}
+	for _, tc := range cases {
+		got := trailingWhitespaceLen(tc.input)
+		if got != tc.want {
+			t.Errorf("%s: trailingWhitespaceLen(%q) = %d, want %d", tc.name, tc.input, got, tc.want)
+		}
+	}
+}
 func TestQwenToolCallValueParsing(t *testing.T) {
 	cases := []struct {
 		desc      string
@@ -867,6 +1055,8 @@ func TestTrailingWhitespaceLen(t *testing.T) {
 		{desc: "trailing whitespace with newlines", s: "abc \n", want: 2},
 		{desc: "only whitespace", s: " \n  ", want: 4},
 		{desc: "leading whitespace doesn't count", s: " \n abc", want: 0},
+		{desc: "unicode with trailing space", s: "测试🎯 ", want: 1},
+		{desc: "unicode with trailing tab and newline", s: "مرحبا\t\n", want: 2},
 	}
 	for _, tc := range cases {
@@ -876,3 +1066,30 @@ func TestTrailingWhitespaceLen(t *testing.T) {
 		}
 	}
 }
+func TestOverlapFunction(t *testing.T) {
+	cases := []struct {
+		desc  string
+		s     string
+		delim string
+		want  int
+	}{
+		{desc: "no overlap", s: "hello", delim: "<tool", want: 0},
+		{desc: "full overlap", s: "hello<tool", delim: "<tool>", want: 5},
+		{desc: "partial overlap", s: "hello<to", delim: "<tool>", want: 3},
+		{desc: "unicode with partial overlap", s: "测试🎯<to", delim: "<tool>", want: 3},
+		{desc: "unicode string with no overlap", s: "مرحبا", delim: "<tool>", want: 0},
+		{desc: "unicode at boundary", s: "世界<", delim: "<tool>", want: 1},
+		{desc: "unicode delimiter single rune", s: "hello🔧", delim: "🔧工具", want: len("🔧")},
+		{desc: "unicode delimiter multiple runes", s: "hello🔧工", delim: "🔧工具", want: len("🔧工")},
+	}
+	for _, tc := range cases {
+		t.Run(tc.desc, func(t *testing.T) {
+			got := overlap(tc.s, tc.delim)
+			if got != tc.want {
+				t.Errorf("overlap(%q, %q) = %d, want %d", tc.s, tc.delim, got, tc.want)
+			}
+		})
+	}
+}