Commit 35f95fe9 authored by carlushuang

movaps->movups, and support loop over L1

parent e72c0c43
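For context on the commit title: movaps requires a 16-byte-aligned memory operand and faults on unaligned addresses, while movups accepts any alignment (and on recent cores pays little or no penalty when the address happens to be aligned). A minimal illustration of the difference using SSE intrinsics; the function and its buffer are hypothetical and not part of this commit:
#include <immintrin.h>
// Load 4 floats from a pointer with no alignment guarantee.
// _mm_loadu_ps lowers to movups and is safe for any address;
// _mm_load_ps (movaps) would fault if p were not 16-byte aligned.
static inline float sum4_unaligned(const float* p)
{
    __m128 v = _mm_loadu_ps(p);
    float tmp[4];
    _mm_storeu_ps(tmp, v);
    return tmp[0] + tmp[1] + tmp[2] + tmp[3];
}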
#ifndef CK_THREADWISE_PARAM_HPP
#define CK_THREADWISE_PARAM_HPP
#include "common_header.hpp"
#include "math.hpp"
namespace ck {
namespace cpu {
struct ThreadwiseGemmParam
{
const void* p_a;
const void* p_b;
void* p_c;
uint64_t Kr;
uint64_t lda; // in units of bytes
uint64_t ldb; // in units of bytes
uint64_t ldc; // in units of bytes
float alpha;
uint32_t _pack0;
} __attribute__((packed));
} // namespace cpu
} // namespace ck
#endif
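The packed parameter block above is what a threadwise GEMM micro-kernel consumes. A minimal usage sketch for a row-major fp32 tile follows; the pointer names, tile extents, and the ukernel entry point are hypothetical and not defined in this commit:
// Hypothetical usage: all strides are in bytes, matching the comments in the struct.
ck::cpu::ThreadwiseGemmParam param{};
param.p_a   = a_ptr;             // packed A tile (fp32), assumption
param.p_b   = b_ptr;             // packed B tile (fp32), assumption
param.p_c   = c_ptr;             // C tile (fp32), assumption
param.Kr    = K;                 // reduction extent handed to the micro-kernel (assumption)
param.lda   = K * sizeof(float); // row stride of A in bytes
param.ldb   = N * sizeof(float); // row stride of B in bytes
param.ldc   = N * sizeof(float); // row stride of C in bytes
param.alpha = 1.0f;
// ukernel(&param);              // hypothetical micro-kernel entry point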
#ifndef CK_CPUID_HPP
#define CK_CPUID_HPP
#include <cstdint>
namespace ck {
namespace cpu {
enum cpuid_vendor
{
cpuid_vendor_intel = 0,
cpuid_vendor_amd = 1,
cpuid_vendor_other = 2,
};
enum cpuid_cache_type
{
cpuid_cache_type_null = 0,
cpuid_cache_type_dcache = 1,
cpuid_cache_type_icache = 2,
cpuid_cache_type_unified = 3,
};
struct cpuid_raw
{
uint32_t eax{0};
uint32_t ebx{0};
uint32_t ecx{0};
uint32_t edx{0};
};
struct cpuid_cache_detail
{
uint32_t size{0};
uint32_t type{0};
uint32_t cache_line_size{0};
uint32_t associativity{0};
uint32_t sets{0};
uint32_t partitions{0};
uint32_t shared_by_procs{0}; // logical processors sharing this cache; with HT/SMT enabled
// this is typically 2 for L1/L2, and 1 when HT is disabled
uint32_t cores_per_socket{0}; // hardware cores per physical socket; a system may have
// multiple sockets. TODO: may not be needed
uint32_t flags{0};
};
struct cpuid_cache_hierarchy
{
cpuid_cache_detail l1i;
cpuid_cache_detail l1d;
cpuid_cache_detail l2;
cpuid_cache_detail l3;
cpuid_cache_detail l4;
};
static inline cpuid_raw cpuid(uint32_t eax, uint32_t ecx)
{
// some leaves require a sub-leaf index in ecx;
// for others, ecx is ignored.
uint32_t ebx, edx;
asm __volatile__("mov %0, %%eax\n"
"mov %2, %%ecx\n"
"cpuid\n"
"mov %%eax, %0\n"
"mov %%ebx, %1\n"
"mov %%ecx, %2\n"
"mov %%edx, %3\n"
: "=r"(eax), "=r"(ebx), "=r"(ecx), "=r"(edx)
: "0"(eax), "2"(ecx));
return {eax, ebx, ecx, edx};
}
static inline cpuid_vendor cpuid_query_vendor()
{
cpuid_raw r = cpuid(0, 0);
if(r.ebx == 0x756E6547U /*Genu*/ && r.edx == 0x49656E69U /*ineI*/ &&
r.ecx == 0x6C65746EU /*ntel*/)
{
return cpuid_vendor_intel;
}
if(r.ebx == 0x68747541U /*Auth*/ && r.edx == 0x69746E65U /*enti*/ &&
r.ecx == 0x444D4163U /*cAMD*/)
{
return cpuid_vendor_amd;
}
if(r.ebx == 0x69444D41U /*AMDi*/ && r.edx == 0x74656273U /*sbet*/ &&
r.ecx == 0x21726574U /*ter!*/)
{
return cpuid_vendor_amd;
}
return cpuid_vendor_other;
}
static inline cpuid_cache_hierarchy cpuid_query_cache()
{
cpuid_cache_hierarchy cache_hierarchy;
cpuid_vendor vendor = cpuid_query_vendor();
uint32_t leaf_cache_id = vendor == cpuid_vendor_amd ? 0x8000001d : 0x4;
for(uint32_t ecx_idx = 0;; ecx_idx++)
{
cpuid_raw r = cpuid(leaf_cache_id, ecx_idx);
uint32_t cache_type = r.eax & 0x1f;
if(cache_type == cpuid_cache_type_null)
break; // Null, no more cache
uint32_t cache_level = (r.eax >> 5) & 0x7;
uint32_t cache_shared_by_cores = 1 + ((r.eax >> 14) & 0xfff);
uint32_t cache_lpp_cores = 1 + ((r.eax >> 26) & 0x3f);
uint32_t cache_line_size = 1 + (r.ebx & 0xfff);
uint32_t cache_partitions = 1 + ((r.ebx >> 12) & 0x3ff);
uint32_t cache_associativity = 1 + (r.ebx >> 22);
uint32_t cache_sets = 1 + r.ecx;
switch(cache_level)
{
case 1:
if(cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified)
{
cache_hierarchy.l1d.size =
cache_partitions * cache_sets * cache_associativity * cache_line_size;
cache_hierarchy.l1d.type = cache_type;
cache_hierarchy.l1d.cache_line_size = cache_line_size;
cache_hierarchy.l1d.associativity = cache_associativity;
cache_hierarchy.l1d.sets = cache_sets;
cache_hierarchy.l1d.partitions = cache_partitions;
cache_hierarchy.l1d.shared_by_procs = cache_shared_by_cores;
cache_hierarchy.l1d.cores_per_socket = cache_lpp_cores;
}
else if(cache_type == cpuid_cache_type_icache)
{
cache_hierarchy.l1i.size =
cache_partitions * cache_sets * cache_associativity * cache_line_size;
cache_hierarchy.l1i.type = cache_type;
cache_hierarchy.l1i.cache_line_size = cache_line_size;
cache_hierarchy.l1i.associativity = cache_associativity;
cache_hierarchy.l1i.sets = cache_sets;
cache_hierarchy.l1i.partitions = cache_partitions;
cache_hierarchy.l1i.shared_by_procs = cache_shared_by_cores;
cache_hierarchy.l1i.cores_per_socket = cache_lpp_cores;
}
break;
case 2:
if(cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified)
{
cache_hierarchy.l2.size =
cache_partitions * cache_sets * cache_associativity * cache_line_size;
cache_hierarchy.l2.type = cache_type;
cache_hierarchy.l2.cache_line_size = cache_line_size;
cache_hierarchy.l2.associativity = cache_associativity;
cache_hierarchy.l2.sets = cache_sets;
cache_hierarchy.l2.partitions = cache_partitions;
cache_hierarchy.l2.shared_by_procs = cache_shared_by_cores;
cache_hierarchy.l2.cores_per_socket = cache_lpp_cores;
}
break;
case 3:
if(cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified)
{
cache_hierarchy.l3.size =
cache_partitions * cache_sets * cache_associativity * cache_line_size;
cache_hierarchy.l3.type = cache_type;
cache_hierarchy.l3.cache_line_size = cache_line_size;
cache_hierarchy.l3.associativity = cache_associativity;
cache_hierarchy.l3.sets = cache_sets;
cache_hierarchy.l3.partitions = cache_partitions;
cache_hierarchy.l3.shared_by_procs = cache_shared_by_cores;
cache_hierarchy.l3.cores_per_socket = cache_lpp_cores;
}
break;
case 4:
if(cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified)
{
cache_hierarchy.l4.size =
cache_partitions * cache_sets * cache_associativity * cache_line_size;
cache_hierarchy.l4.type = cache_type;
cache_hierarchy.l4.cache_line_size = cache_line_size;
cache_hierarchy.l4.associativity = cache_associativity;
cache_hierarchy.l4.sets = cache_sets;
cache_hierarchy.l4.partitions = cache_partitions;
cache_hierarchy.l4.shared_by_procs = cache_shared_by_cores;
cache_hierarchy.l4.cores_per_socket = cache_lpp_cores;
}
break;
}
}
return cache_hierarchy;
}
} // namespace cpu
} // namespace ck
#endif
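A short usage sketch tying the query above to the "loop over L1" in the commit title: read the detected L1 data-cache geometry and derive an illustrative blocking factor so a packed panel stays resident in L1d. The include path and the half-of-L1d heuristic are assumptions for illustration only:
#include <cstdio>
#include "cpuid.hpp" // this header; path illustrative
int main()
{
    ck::cpu::cpuid_cache_hierarchy caches = ck::cpu::cpuid_query_cache();
    std::printf("L1d: %u bytes, line %u B, assoc %u\n",
                caches.l1d.size, caches.l1d.cache_line_size, caches.l1d.associativity);
    // Illustrative heuristic: reserve roughly half of L1d for one packed fp32 panel.
    unsigned block_elems = (caches.l1d.size / 2) / sizeof(float);
    std::printf("fp32 elements per L1 block (illustrative): %u\n", block_elems);
    return 0;
}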