#ifndef CK_CPUID_HPP
#define CK_CPUID_HPP

#include <cstdint>

namespace ck {
namespace cpu {

enum cpuid_vendor
{
    cpuid_vendor_intel = 0,
    cpuid_vendor_amd   = 1,
    cpuid_vendor_other = 2,
};

enum cpuid_cache_type
{
    cpuid_cache_type_null    = 0,
    cpuid_cache_type_dcache  = 1,
    cpuid_cache_type_icache  = 2,
    cpuid_cache_type_unified = 3,
};

struct cpuid_raw
{
    uint32_t eax{0};
    uint32_t ebx{0};
    uint32_t ecx{0};
    uint32_t edx{0};
};

struct cpuid_cache_detail
{
    uint32_t size{0};
    uint32_t type{0};
    uint32_t cache_line_size{0};
    uint32_t associativity{0};
    uint32_t sets{0};
    uint32_t partitions{0};
    uint32_t shared_by_procs{0};  // logical processors sharing this cache; with SMT/HT
                                  // enabled this is typically 2 for L1/L2, 1 when HT is off
    uint32_t cores_per_socket{0}; // physical cores in a socket; a system may have multiple
                                  // sockets. TODO: possibly not needed
    uint32_t flags{0};
};

struct cpuid_cache_hierarchy
{
    cpuid_cache_detail l1i;
    cpuid_cache_detail l1d;
    cpuid_cache_detail l2;
    cpuid_cache_detail l3;
    cpuid_cache_detail l4;
};

static inline cpuid_raw cpuid(uint32_t eax, uint32_t ecx)
{
    // Some leaves (e.g. 0x4) take a sub-leaf index in ecx; others ignore it.
    // Use explicit register constraints so the compiler knows cpuid reads
    // eax/ecx and writes all four registers; the original "=r" constraints
    // with manual movs left eax/ebx/ecx/edx undeclared as clobbers, which
    // can silently corrupt whatever the compiler had placed there.
    uint32_t ebx, edx;
    asm volatile("cpuid"
                 : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                 : "0"(eax), "2"(ecx));
    return {eax, ebx, ecx, edx};
}
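
// Illustrative sketch (hypothetical helper, not part of the original
// interface): leaf 0 reports the highest supported basic leaf in eax, so a
// caller can verify that leaf 0x4 is actually available before walking the
// cache sub-leaves in cpuid_query_cache() below.
static inline uint32_t cpuid_max_basic_leaf() { return cpuid(0, 0).eax; }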

static inline cpuid_vendor cpuid_query_vendor()
{
    cpuid_raw r = cpuid(0, 0);
    if(r.ebx == 0x756E6547U /*Genu*/ && r.edx == 0x49656E69U /*ineI*/ &&
       r.ecx == 0x6C65746EU /*ntel*/)
    {
        return cpuid_vendor_intel;
    }
    if(r.ebx == 0x68747541U /*Auth*/ && r.edx == 0x69746E65U /*enti*/ &&
       r.ecx == 0x444D4163U /*cAMD*/)
    {
        return cpuid_vendor_amd;
    }
    if(r.ebx == 0x69444D41U /*AMDi*/ && r.edx == 0x74656273U /*sbet*/ &&
       r.ecx == 0x21726574U /*ter!*/)
    {
        return cpuid_vendor_amd;
    }
    return cpuid_vendor_other;
}
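
// Illustrative sketch (hypothetical helper): the magic constants above are
// the 12-byte vendor string from leaf 0, stored little-endian in the order
// ebx, edx, ecx ("GenuineIntel", "AuthenticAMD", or the early-engineering
// sample string "AMDisbetter!").
static inline void cpuid_vendor_string(char out[13])
{
    cpuid_raw r      = cpuid(0, 0);
    uint32_t regs[3] = {r.ebx, r.edx, r.ecx};
    for(int i = 0; i < 12; i++)
        out[i] = static_cast<char>((regs[i / 4] >> (8 * (i % 4))) & 0xff);
    out[12] = '\0';
}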

static inline cpuid_cache_hierarchy cpuid_query_cache()
{
    cpuid_cache_hierarchy cache_hierarchy;
    cpuid_vendor vendor = cpuid_query_vendor();
    // Intel enumerates deterministic cache parameters via leaf 0x4; AMD uses
    // the extended leaf 0x8000001d (available with the TOPOEXT feature).
    uint32_t leaf_cache_id = vendor == cpuid_vendor_amd ? 0x8000001d : 0x4;

    for(uint32_t ecx_idx = 0;; ecx_idx++)
    {
        cpuid_raw r = cpuid(leaf_cache_id, ecx_idx);

        // eax[4:0] is the cache type; 0 means no more caches to enumerate
        uint32_t cache_type = r.eax & 0x1f;
        if(cache_type == cpuid_cache_type_null)
            break;

        // eax[7:5]   cache level (1-based)
        // eax[25:14] logical processors sharing this cache, minus 1
        // eax[31:26] physical cores per socket, minus 1
        uint32_t cache_level            = (r.eax >> 5) & 0x7;
        uint32_t cache_shared_by_procs  = 1 + ((r.eax >> 14) & 0xfff);
        uint32_t cache_cores_per_socket = 1 + ((r.eax >> 26) & 0x3f);

        // ebx[11:0] line size, ebx[21:12] partitions, ebx[31:22] ways of
        // associativity; ecx holds the number of sets (all stored minus 1)
        uint32_t cache_line_size     = 1 + (r.ebx & 0xfff);
        uint32_t cache_partitions    = 1 + ((r.ebx >> 12) & 0x3ff);
        uint32_t cache_associativity = 1 + (r.ebx >> 22);

        uint32_t cache_sets = 1 + r.ecx;

        // Pick the slot in the hierarchy this sub-leaf describes. Only L1 is
        // split into icache/dcache; L2-L4 entries are recorded when they hold
        // data (dcache or unified), exactly as the original per-level blocks did.
        cpuid_cache_detail* detail = nullptr;
        bool is_data =
            cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified;
        switch(cache_level)
        {
        case 1:
            if(is_data)
                detail = &cache_hierarchy.l1d;
            else if(cache_type == cpuid_cache_type_icache)
                detail = &cache_hierarchy.l1i;
            break;
        case 2:
            if(is_data)
                detail = &cache_hierarchy.l2;
            break;
        case 3:
            if(is_data)
                detail = &cache_hierarchy.l3;
            break;
        case 4:
            if(is_data)
                detail = &cache_hierarchy.l4;
            break;
        }

        if(detail != nullptr)
        {
            // total size in bytes = ways * partitions * line size * sets
            detail->size = cache_partitions * cache_sets * cache_associativity * cache_line_size;
            detail->type             = cache_type;
            detail->cache_line_size  = cache_line_size;
            detail->associativity    = cache_associativity;
            detail->sets             = cache_sets;
            detail->partitions       = cache_partitions;
            detail->shared_by_procs  = cache_shared_by_procs;
            detail->cores_per_socket = cache_cores_per_socket;
        }
    }

    return cache_hierarchy;
}
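
// Usage sketch (illustrative; the helper name and fallback behavior are
// assumptions, not part of the original interface): callers tuning block
// sizes typically read the data-cache sizes and fall back to a conservative
// default when cpuid reports nothing, e.g. under some hypervisors.
static inline uint32_t cpuid_query_l2_size_or(uint32_t fallback_bytes)
{
    const cpuid_cache_hierarchy caches = cpuid_query_cache();
    return caches.l2.size != 0 ? caches.l2.size : fallback_bytes;
}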

} // namespace cpu
} // namespace ck
#endif // CK_CPUID_HPP