0008-fix-deepseek-deseret-regex.patch 3.31 KB
Newer Older
1
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2
3
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 19:43:06 -0700
4
5
Subject: [PATCH] fix deepseek deseret regex

6
7
8
on some systems, deepseek's regex would throw an error
on windows due to the deseret characters in the matching
regex
9
10
---
 src/llama-vocab.cpp |  2 +-
11
12
 src/unicode.cpp     | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)
13
14

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
15
index a9ee9f03..1306864e 100644
16
17
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
18
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
19
20
21
22
23
24
25
26
27
             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                 regex_exprs = {
                     "[\r\n]",
-                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z\U00010400-\U0001044f𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
                     "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
                     "\\s+$",
                     "[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
28
index e63bb4ab..73cb2b1a 100644
29
30
31
32
33
34
35
36
37
38
39
40
41
42
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif
 
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
 #include "unicode.h"
 #include "unicode-data.h"
 
43
@@ -200,6 +205,21 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
 }
 
 static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
+#ifdef _WIN32
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
+    if (!wlen) {
+        throw std::invalid_argument("failed to convert regex");
+    }
+    wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
+    wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wbuf, wlen);
+    if (!wlen) {
+        free(wbuf);
+        throw std::invalid_argument("failed to convert regex");
+    }
+    std::wstring ret = std::wstring(wbuf);
+    free(wbuf);
+    return ret;
+#else
62
63
64
 #if defined(__clang__)
     // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
65
@@ -213,6 +233,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
66
67
 #endif
 
68
69
70
71
72
     return conv.from_bytes(s);
+#endif
 }
 
 static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {