#ifndef CK_CPUID_HPP
#define CK_CPUID_HPP

#include <cstdint>

namespace ck {
namespace cpu {

enum cpuid_vendor
{
    cpuid_vendor_intel = 0,
    cpuid_vendor_amd   = 1,
    cpuid_vendor_other = 2,
};

enum cpuid_cache_type
{
    cpuid_cache_type_null    = 0,
    cpuid_cache_type_dcache  = 1,
    cpuid_cache_type_icache  = 2,
    cpuid_cache_type_unified = 3,
};

struct cpuid_raw
{
    uint32_t eax{0};
    uint32_t ebx{0};
    uint32_t ecx{0};
    uint32_t edx{0};
};

struct cpuid_cache_detail
{
    uint32_t size{0};
    uint32_t type{0};
    uint32_t cache_line_size{0};
    uint32_t associativity{0};
    uint32_t sets{0};
    uint32_t partitions{0};
    // Number of logical processors sharing this cache. With SMT/HT enabled
    // this is usually 2 for L1/L2 (two hardware threads per core); with SMT
    // disabled it is usually 1.
    uint32_t shared_by_procs{0};
    // Hardware cores in one physical socket; a system may have multiple
    // sockets. TODO: possibly not needed.
    uint32_t cores_per_socket{0};
    uint32_t flags{0};
};

struct cpuid_cache_hierarchy
{
    cpuid_cache_detail l1i;
    cpuid_cache_detail l1d;
    cpuid_cache_detail l2;
    cpuid_cache_detail l3;
    cpuid_cache_detail l4;
};

// Execute the cpuid instruction. Some leaves take a sub-leaf index in ecx;
// for the others ecx is ignored. Register-specific constraints ("=a", "=b",
// "=c", "=d") are required here: with generic "=r" outputs the compiler may
// allocate an output to one of the registers cpuid itself clobbers.
static inline cpuid_raw cpuid(uint32_t eax, uint32_t ecx)
{
    uint32_t ebx, edx;
    asm volatile("cpuid"
                 : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                 : "0"(eax), "2"(ecx));
    return {eax, ebx, ecx, edx};
}

static inline cpuid_vendor cpuid_query_vendor()
{
    // Leaf 0 returns the 12-byte vendor string in ebx:edx:ecx.
    cpuid_raw r = cpuid(0, 0);
    if(r.ebx == 0x756E6547U /*Genu*/ && r.edx == 0x49656E69U /*ineI*/ &&
       r.ecx == 0x6C65746EU /*ntel*/)
    {
        return cpuid_vendor_intel;
    }
    if(r.ebx == 0x68747541U /*Auth*/ && r.edx == 0x69746E65U /*enti*/ &&
       r.ecx == 0x444D4163U /*cAMD*/)
    {
        return cpuid_vendor_amd;
    }
    // Early engineering samples of the AMD K5 report "AMDisbetter!".
    if(r.ebx == 0x69444D41U /*AMDi*/ && r.edx == 0x74656273U /*sbet*/ &&
       r.ecx == 0x21726574U /*ter!*/)
    {
        return cpuid_vendor_amd;
    }
    return cpuid_vendor_other;
}
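// Usage sketch (illustrative only, not part of this header's API): since
// leaf 0 packs the vendor string into ebx:edx:ecx, the raw registers can be
// reassembled into readable text. dump_vendor_string is a hypothetical
// helper name chosen for the example.
//
//   #include <cstdio>
//   #include <cstring>
//
//   void dump_vendor_string()
//   {
//       ck::cpu::cpuid_raw r = ck::cpu::cpuid(0, 0);
//       char name[13];
//       std::memcpy(name + 0, &r.ebx, 4); // "Genu" / "Auth"
//       std::memcpy(name + 4, &r.edx, 4); // "ineI" / "enti"
//       std::memcpy(name + 8, &r.ecx, 4); // "ntel" / "cAMD"
//       name[12] = '\0';
//       std::printf("vendor: %s\n", name);
//   }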
static inline cpuid_cache_hierarchy cpuid_query_cache()
{
    cpuid_cache_hierarchy cache_hierarchy;
    cpuid_vendor vendor = cpuid_query_vendor();

    // Intel reports deterministic cache parameters in leaf 0x4; AMD reports
    // the equivalent information in extended leaf 0x8000001d.
    uint32_t leaf_cache_id = vendor == cpuid_vendor_amd ? 0x8000001d : 0x4;

    for(uint32_t ecx_idx = 0;; ecx_idx++)
    {
        // Each sub-leaf (ecx index) describes one cache.
        cpuid_raw r = cpuid(leaf_cache_id, ecx_idx);

        uint32_t cache_type = r.eax & 0x1f;
        if(cache_type == cpuid_cache_type_null)
            break; // null type: no more caches

        // All of these fields are encoded as "value minus 1".
        uint32_t cache_level            = (r.eax >> 5) & 0x7;
        uint32_t cache_shared_by_procs  = 1 + ((r.eax >> 14) & 0xfff);
        uint32_t cache_cores_per_socket = 1 + ((r.eax >> 26) & 0x3f);
        uint32_t cache_line_size        = 1 + (r.ebx & 0xfff);
        uint32_t cache_partitions       = 1 + ((r.ebx >> 12) & 0x3ff);
        uint32_t cache_associativity    = 1 + (r.ebx >> 22);
        uint32_t cache_sets             = 1 + r.ecx;

        // Select the slot this sub-leaf describes. Data and unified caches
        // go to l1d/l2/l3/l4; only level 1 has a separate instruction cache.
        cpuid_cache_detail* detail = nullptr;
        bool is_data_or_unified =
            cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified;
        switch(cache_level)
        {
        case 1:
            if(is_data_or_unified)
                detail = &cache_hierarchy.l1d;
            else if(cache_type == cpuid_cache_type_icache)
                detail = &cache_hierarchy.l1i;
            break;
        case 2:
            if(is_data_or_unified)
                detail = &cache_hierarchy.l2;
            break;
        case 3:
            if(is_data_or_unified)
                detail = &cache_hierarchy.l3;
            break;
        case 4:
            if(is_data_or_unified)
                detail = &cache_hierarchy.l4;
            break;
        }

        if(detail != nullptr)
        {
            // total size = ways * partitions * line size * sets
            detail->size =
                cache_partitions * cache_sets * cache_associativity * cache_line_size;
            detail->type             = cache_type;
            detail->cache_line_size  = cache_line_size;
            detail->associativity    = cache_associativity;
            detail->sets             = cache_sets;
            detail->partitions       = cache_partitions;
            detail->shared_by_procs  = cache_shared_by_procs;
            detail->cores_per_socket = cache_cores_per_socket;
        }
    }
    return cache_hierarchy;
}

} // namespace cpu
} // namespace ck

#endif // CK_CPUID_HPP
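// Usage sketch (illustrative only; main(), the header path "ck_cpuid.hpp",
// and the printf formatting are assumptions for the example):
//
//   #include "ck_cpuid.hpp"
//   #include <cstdio>
//
//   int main()
//   {
//       ck::cpu::cpuid_cache_hierarchy h = ck::cpu::cpuid_query_cache();
//       std::printf("L1d: %u bytes (%u-way, %u-byte lines, shared by %u procs)\n",
//                   h.l1d.size, h.l1d.associativity, h.l1d.cache_line_size,
//                   h.l1d.shared_by_procs);
//       std::printf("L2:  %u bytes, L3: %u bytes\n", h.l2.size, h.l3.size);
//       return 0;
//   }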