Commit 35f95fe9 authored by carlushuang

movaps->movups, and support loop over L1

parent e72c0c43
#ifndef CK_THREADWISE_PARAM_HPP
#define CK_THREADWISE_PARAM_HPP

#include "common_header.hpp"
#include "math.hpp"

namespace ck {
namespace cpu {

// parameter block passed to the threadwise GEMM kernels; packed so the
// byte layout is deterministic and stable.
struct ThreadwiseGemmParam
{
    const void* p_a;
    const void* p_b;
    void* p_c;
    uint64_t Kr;
    uint64_t lda; // in units of bytes
    uint64_t ldb; // in units of bytes
    uint64_t ldc; // in units of bytes
    float alpha;
    uint32_t _pack0; // explicit padding out to 64 bytes
} __attribute__((packed));

} // namespace cpu
} // namespace ck
#endif
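Because the struct is packed and the commit's movaps->movups change suggests it is consumed by hand-written SSE/AVX microkernels, the member byte offsets are effectively an ABI. A minimal sketch for pinning them down at compile time on x86-64 (not part of this commit; the include name is hypothetical):

#include <cstddef> // offsetof
#include "threadwise_param.hpp" // hypothetical name for the header above

using ck::cpu::ThreadwiseGemmParam;

// three pointers and four uint64_t members occupy 8 bytes each back to
// back; alpha and _pack0 pad the struct out to a full 64 bytes.
static_assert(offsetof(ThreadwiseGemmParam, p_a) == 0, "p_a at byte 0");
static_assert(offsetof(ThreadwiseGemmParam, p_b) == 8, "p_b at byte 8");
static_assert(offsetof(ThreadwiseGemmParam, p_c) == 16, "p_c at byte 16");
static_assert(offsetof(ThreadwiseGemmParam, Kr) == 24, "Kr at byte 24");
static_assert(offsetof(ThreadwiseGemmParam, lda) == 32, "lda at byte 32");
static_assert(offsetof(ThreadwiseGemmParam, ldb) == 40, "ldb at byte 40");
static_assert(offsetof(ThreadwiseGemmParam, ldc) == 48, "ldc at byte 48");
static_assert(offsetof(ThreadwiseGemmParam, alpha) == 56, "alpha at byte 56");
static_assert(sizeof(ThreadwiseGemmParam) == 64, "64-byte parameter block");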
#ifndef CK_CPUID_HPP
#define CK_CPUID_HPP

#include <cstdint>

namespace ck {
namespace cpu {

enum cpuid_vendor
{
    cpuid_vendor_intel = 0,
    cpuid_vendor_amd   = 1,
    cpuid_vendor_other = 2,
};

enum cpuid_cache_type
{
    cpuid_cache_type_null    = 0,
    cpuid_cache_type_dcache  = 1,
    cpuid_cache_type_icache  = 2,
    cpuid_cache_type_unified = 3,
};

struct cpuid_raw
{
    uint32_t eax{0};
    uint32_t ebx{0};
    uint32_t ecx{0};
    uint32_t edx{0};
};

struct cpuid_cache_detail
{
    uint32_t size{0};
    uint32_t type{0};
    uint32_t cache_line_size{0};
    uint32_t associativity{0};
    uint32_t sets{0};
    uint32_t partitions{0};
    uint32_t shared_by_procs{0};  // logical processors sharing this cache; with
                                  // hyper-threading this is usually 2 for L1/L2,
                                  // or 1 when HT is disabled
    uint32_t cores_per_socket{0}; // hardware cores in one physical socket; a
                                  // system may have multiple sockets.
                                  // TODO: may not be needed?
    uint32_t flags{0};
};

struct cpuid_cache_hierarchy
{
    cpuid_cache_detail l1i;
    cpuid_cache_detail l1d;
    cpuid_cache_detail l2;
    cpuid_cache_detail l3;
    cpuid_cache_detail l4;
};
static inline cpuid_raw cpuid(uint32_t eax, uint32_t ecx)
{
    // some leaves take a subleaf index in ecx; for the others ecx is ignored.
    uint32_t ebx, edx;
    // pin each operand to the register cpuid actually reads and writes, so the
    // compiler knows all four of eax/ebx/ecx/edx are produced (and clobbered);
    // generic "=r" outputs would let cpuid silently corrupt live registers.
    asm volatile("cpuid"
                 : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                 : "0"(eax), "2"(ecx));
    return {eax, ebx, ecx, edx};
}
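For reference, GCC and Clang also ship a <cpuid.h> helper that performs the same subleaf query with the register constraints handled internally. A sketch of an equivalent wrapper (an alternative, not what this commit uses; assumes cpuid_raw from above):

#include <cpuid.h> // GCC/Clang intrinsic header

static inline cpuid_raw cpuid_via_intrinsic(uint32_t leaf, uint32_t subleaf)
{
    cpuid_raw r;
    // __get_cpuid_count validates the requested leaf against the maximum
    // supported leaf and returns 0 when it is out of range.
    if(!__get_cpuid_count(leaf, subleaf, &r.eax, &r.ebx, &r.ecx, &r.edx))
        return {}; // all-zero result; decodes as cpuid_cache_type_null
    return r;
}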
static inline cpuid_vendor cpuid_query_vendor()
{
    cpuid_raw r = cpuid(0, 0);
    if(r.ebx == 0x756E6547U /*Genu*/ && r.edx == 0x49656E69U /*ineI*/ &&
       r.ecx == 0x6C65746EU /*ntel*/)
    {
        return cpuid_vendor_intel;
    }
    if(r.ebx == 0x68747541U /*Auth*/ && r.edx == 0x69746E65U /*enti*/ &&
       r.ecx == 0x444D4163U /*cAMD*/)
    {
        return cpuid_vendor_amd;
    }
    if(r.ebx == 0x69444D41U /*AMDi*/ && r.edx == 0x74656273U /*sbet*/ &&
       r.ecx == 0x21726574U /*ter!*/)
    {
        return cpuid_vendor_amd;
    }
    return cpuid_vendor_other;
}
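The magic constants above are just the 12-byte vendor string ("GenuineIntel", "AuthenticAMD", or the early-AMD "AMDisbetter!") chunked little-endian into EBX, EDX, ECX, in that order. A sketch that reconstructs the readable string, handy when adding a new vendor (not part of this commit):

#include <cstdio>
#include <cstring>

static inline void print_cpu_vendor_string()
{
    ck::cpu::cpuid_raw r = ck::cpu::cpuid(0, 0);
    char vendor[13]      = {0};
    // leaf 0 returns the vendor string in EBX, EDX, ECX order
    std::memcpy(vendor + 0, &r.ebx, 4);
    std::memcpy(vendor + 4, &r.edx, 4);
    std::memcpy(vendor + 8, &r.ecx, 4);
    std::printf("vendor: %s\n", vendor);
}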
static inline cpuid_cache_hierarchy cpuid_query_cache()
{
    cpuid_cache_hierarchy cache_hierarchy;
    cpuid_vendor vendor = cpuid_query_vendor();
    // AMD reports deterministic cache parameters through leaf 0x8000001d,
    // Intel through leaf 0x4; both use the same field layout.
    uint32_t leaf_cache_id = vendor == cpuid_vendor_amd ? 0x8000001d : 0x4;
    for(uint32_t ecx_idx = 0;; ecx_idx++)
    {
        cpuid_raw r         = cpuid(leaf_cache_id, ecx_idx);
        uint32_t cache_type = r.eax & 0x1f;
        if(cache_type == cpuid_cache_type_null)
            break; // null type, no more cache levels

        uint32_t cache_level           = (r.eax >> 5) & 0x7;
        uint32_t cache_shared_by_cores = 1 + ((r.eax >> 14) & 0xfff);
        uint32_t cache_lpp_cores       = 1 + ((r.eax >> 26) & 0x3f);
        uint32_t cache_line_size       = 1 + (r.ebx & 0xfff);
        uint32_t cache_partitions      = 1 + ((r.ebx >> 12) & 0x3ff);
        uint32_t cache_associativity   = 1 + (r.ebx >> 22);
        uint32_t cache_sets            = 1 + r.ecx;

        auto fill_detail = [&](cpuid_cache_detail& detail) {
            detail.size = cache_partitions * cache_sets * cache_associativity * cache_line_size;
            detail.type             = cache_type;
            detail.cache_line_size  = cache_line_size;
            detail.associativity    = cache_associativity;
            detail.sets             = cache_sets;
            detail.partitions       = cache_partitions;
            detail.shared_by_procs  = cache_shared_by_cores;
            detail.cores_per_socket = cache_lpp_cores;
        };

        bool is_data_or_unified =
            cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified;

        switch(cache_level)
        {
        case 1:
            if(is_data_or_unified)
                fill_detail(cache_hierarchy.l1d);
            else if(cache_type == cpuid_cache_type_icache)
                fill_detail(cache_hierarchy.l1i);
            break;
        case 2:
            if(is_data_or_unified)
                fill_detail(cache_hierarchy.l2);
            break;
        case 3:
            if(is_data_or_unified)
                fill_detail(cache_hierarchy.l3);
            break;
        case 4:
            if(is_data_or_unified)
                fill_detail(cache_hierarchy.l4);
            break;
        }
    }
    return cache_hierarchy;
}
} // namespace cpu
} // namespace ck
#endif
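With the hierarchy query in place, a kernel can size its K loop so the streamed A/B panels stay resident in L1d, which is what the "loop over L1" part of the commit message points at. A usage sketch (assumes the header above is included; the blocking heuristic is illustrative, not the commit's actual policy):

#include <cstdio>

int main()
{
    ck::cpu::cpuid_cache_hierarchy caches = ck::cpu::cpuid_query_cache();
    std::printf("L1d %u bytes, L2 %u bytes, L3 %u bytes\n",
                caches.l1d.size, caches.l2.size, caches.l3.size);

    // illustrative heuristic only: keep one MRxK panel of A and one KxNR
    // panel of B (floats) within roughly half of L1d, leaving room for C.
    const uint32_t MR = 6, NR = 16; // hypothetical microkernel tile
    uint32_t k_block =
        static_cast<uint32_t>((caches.l1d.size / 2) / ((MR + NR) * sizeof(float)));
    std::printf("K block that fits L1d: %u\n", k_block);
    return 0;
}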