gen-unicode-data.py 6.28 KB
Newer Older
xuxzh1's avatar
init  
xuxzh1 committed
1
from __future__ import annotations
mashun1's avatar
v1  
mashun1 committed
2

xuxzh1's avatar
init  
xuxzh1 committed
3
4
5
import array
import unicodedata
import requests
mashun1's avatar
v1  
mashun1 committed
6
7
8
9


MAX_CODEPOINTS = 0x110000

xuxzh1's avatar
init  
xuxzh1 committed
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"


# see https://www.unicode.org/L2/L1999/UnicodeData.html
def unicode_data_iter():
    """Download UnicodeData.txt and yield one record per codepoint.

    The file is fetched from ``UNICODE_DATA_URL``.  Ranges that the file
    abbreviates as a ``<..., First>`` / ``<..., Last>`` pair of lines are
    expanded so that every codepoint of the range is yielded individually.

    Yields:
        ``(cpt, cpt_lower, cpt_upper, categ, bidir)`` where ``cpt`` is the
        codepoint, ``cpt_lower`` / ``cpt_upper`` are the case-mapping
        codepoints (``0`` when the field is empty), ``categ`` is the
        two-letter general category and ``bidir`` is the bidirectional class.

    Raises:
        requests.RequestException: if the download fails or times out
            (``HTTPError`` for a non-success status).
        AssertionError: if a line does not have the expected shape.
    """
    # A timeout keeps the generator from hanging forever on a stalled connection.
    res = requests.get(UNICODE_DATA_URL, timeout=60)
    res.raise_for_status()
    data = res.content.decode()

    # Record of the pending "<..., First>" line; empty when no range is open.
    prev = []

    for line in data.splitlines():
        # e.g.: 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
        line = line.split(";")

        cpt = int(line[0], base=16)
        assert cpt < MAX_CODEPOINTS

        # Empty mapping fields are read as 0, i.e. "no mapping".
        cpt_lower = int(line[-2] or "0", base=16)
        assert cpt_lower < MAX_CODEPOINTS

        cpt_upper = int(line[-3] or "0", base=16)
        assert cpt_upper < MAX_CODEPOINTS

        categ = line[2].strip()
        assert len(categ) == 2

        # Bidirectional class aliases are 1 to 3 characters long (L, AL, NSM, ...).
        bidir = line[4].strip()
        assert 1 <= len(bidir) <= 3

        name = line[1]
        if name.endswith(", First>"):
            # Start of a range: remember it and wait for the matching "Last>".
            prev = (cpt, cpt_lower, cpt_upper, categ, bidir)
            continue
        if name.endswith(", Last>"):
            # Both ends of a range must agree and carry no case mappings.
            assert prev[1:] == (0, 0, categ, bidir)
            for c in range(prev[0], cpt):
                yield (c, cpt_lower, cpt_upper, categ, bidir)
            # Close the range so a stray "Last>" cannot pair with a stale "First>".
            prev = []

        yield (cpt, cpt_lower, cpt_upper, categ, bidir)


# Codepoint category flags (bitmask); see the matching definitions in unicode.h
CODEPOINT_FLAG_UNDEFINED   = 0x0001  #
CODEPOINT_FLAG_NUMBER      = 0x0002  # \p{N}
CODEPOINT_FLAG_LETTER      = 0x0004  # \p{L}
CODEPOINT_FLAG_SEPARATOR   = 0x0008  # \p{Z}
CODEPOINT_FLAG_MARK        = 0x0010  # \p{M}
CODEPOINT_FLAG_PUNCTUATION = 0x0020  # \p{P}
CODEPOINT_FLAG_SYMBOL      = 0x0040  # \p{S}
CODEPOINT_FLAG_CONTROL     = 0x0080  # \p{C}

# Two-letter general category -> flag, grouped by major class.
UNICODE_CATEGORY_TO_FLAG = {
    # Undefined
    "Cn": CODEPOINT_FLAG_UNDEFINED,
    # Control, Format, Private Use, Surrogate
    **dict.fromkeys(("Cc", "Cf", "Co", "Cs"), CODEPOINT_FLAG_CONTROL),
    # Lowercase, Modifier, Other, Titlecase, Uppercase, Cased Letter
    **dict.fromkeys(("Ll", "Lm", "Lo", "Lt", "Lu", "L&"), CODEPOINT_FLAG_LETTER),
    # Spacing, Enclosing, Nonspacing Mark
    **dict.fromkeys(("Mc", "Me", "Mn"), CODEPOINT_FLAG_MARK),
    # Decimal, Letter, Other Number
    **dict.fromkeys(("Nd", "Nl", "No"), CODEPOINT_FLAG_NUMBER),
    # Connector, Dash, Close, Final, Initial, Other, Open Punctuation
    **dict.fromkeys(("Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps"), CODEPOINT_FLAG_PUNCTUATION),
    # Currency, Modifier, Math, Other Symbol
    **dict.fromkeys(("Sc", "Sk", "Sm", "So"), CODEPOINT_FLAG_SYMBOL),
    # Line, Paragraph, Space Separator
    **dict.fromkeys(("Zl", "Zp", "Zs"), CODEPOINT_FLAG_SEPARATOR),
}


# One 16-bit flag word per codepoint; codepoints that UnicodeData.txt does not
# list keep the UNDEFINED default.
codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS

# Tables filled in below and written out as C++ further down in the script.
table_whitespace = []
table_lowercase = []
table_uppercase = []
table_nfd = []

for cpt, lower, upper, categ, _bidir in unicode_data_iter():
    # codepoint category flags
    codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]

    # lowercase / uppercase conversion (0 means the codepoint has no mapping)
    if lower:
        table_lowercase.append((cpt, lower))
    if upper:
        table_uppercase.append((cpt, upper))

    # NFD normalization: only the first codepoint of the decomposition is kept.
    # NOTE: unicodedata uses the Unicode database bundled with the running
    # interpreter, which may be a different version than the downloaded file.
    nfd_first = ord(unicodedata.normalize('NFD', chr(cpt))[0])
    if nfd_first != cpt:
        table_nfd.append((cpt, nfd_first))


# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
table_whitespace += [
    *range(0x0009, 0x000D + 1),
    *range(0x2000, 0x200A + 1),
    0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
]


# sort every table by codepoint (tuples compare on their first element first)
for table in (table_whitespace, table_lowercase, table_uppercase, table_nfd):
    table.sort()
mashun1's avatar
v1  
mashun1 committed
135
136
137


# group ranges with same flags
# Each entry opens a run of codepoints sharing one flag value; the run ends
# right before the next entry's start.
ranges_flags: list[tuple[int, int]] = [(0, codepoint_flags[0])]  # start, flags
for codepoint, flags in enumerate(codepoint_flags):
    _, open_flags = ranges_flags[-1]
    if flags == open_flags:
        continue  # still inside the current run
    ranges_flags.append((codepoint, flags))
# terminating entry so the last real run has an end
ranges_flags.append((MAX_CODEPOINTS, 0x0000))
mashun1's avatar
v1  
mashun1 committed
143
144
145


# group ranges with same nfd
# Consecutive codepoints that normalize to the same codepoint are merged into
# a single (start, last, nfd) entry.
ranges_nfd: list[tuple[int, int, int]] = [(0, 0, 0)]  # start, last, nfd
for cpt, nfd in table_nfd:
    first, last, last_nfd = ranges_nfd[-1]
    if last == cpt - 1 and last_nfd == nfd:
        # contiguous with the previous entry and same target: extend it
        ranges_nfd[-1] = (first, cpt, nfd)
    else:
        # otherwise start a new single-codepoint range
        ranges_nfd.append((cpt, cpt, nfd))


xuxzh1's avatar
init  
xuxzh1 committed
155
156
# Generate 'unicode-data.cpp':
#   python ./scripts/gen-unicode-data.py > unicode-data.cpp
mashun1's avatar
v1  
mashun1 committed
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178

def out(line=""):
    """Print *line* to standard output, terminated by a single newline."""
    print(line)  # noqa


# File header and includes of the generated C++ source.
out("""\
// generated with scripts/gen-unicode-data.py

#include "unicode-data.h"

#include <cstdint>
#include <vector>
#include <unordered_map>
#include <unordered_set>
""")

# Flag ranges: one {start, flags} pair per run of equal flags.
out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1")
for range_start, range_flags in ranges_flags:
    out("{0x%06X, 0x%04X}," % (range_start, range_flags))
out("};\n")

# Whitespace codepoints.
out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
for whitespace_cpt in table_whitespace:
    out("0x%06X," % whitespace_cpt)
out("};\n")

# Lowercase mapping: {codepoint, lowercase codepoint}.
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
for source_cpt, lower_cpt in table_lowercase:
    out("{0x%06X, 0x%06X}," % (source_cpt, lower_cpt))
out("};\n")

# Uppercase mapping: {codepoint, uppercase codepoint}.
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
for source_cpt, upper_cpt in table_uppercase:
    out("{0x%06X, 0x%06X}," % (source_cpt, upper_cpt))
out("};\n")

# NFD ranges: {start, last, nfd}.
out("const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd")
for nfd_start, nfd_last, nfd_target in ranges_nfd:
    out("{0x%06X, 0x%06X, 0x%06X}," % (nfd_start, nfd_last, nfd_target))
out("};\n")