Commit 35f95fe9 authored by carlushuang

movaps->movups, and support loop over L1

parent e72c0c43
#ifndef CK_THREADWISE_PARAM_HPP
#define CK_THREADWISE_PARAM_HPP

#include "common_header.hpp"
#include "math.hpp"

namespace ck {
namespace cpu {

// parameter block passed to the threadwise GEMM kernels; packed so the
// byte layout is deterministic and stable.
struct ThreadwiseGemmParam
{
    const void* p_a;
    const void* p_b;
    void* p_c;
    uint64_t Kr;
    uint64_t lda; // in units of bytes
    uint64_t ldb; // in units of bytes
    uint64_t ldc; // in units of bytes
    float alpha;
    uint32_t _pack0; // explicit padding out to 64 bytes
} __attribute__((packed));

} // namespace cpu
} // namespace ck
#endif
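Because the struct is packed and the commit's movaps->movups change suggests it is consumed by hand-written SSE/AVX microkernels, the member byte offsets are effectively an ABI. A minimal sketch for pinning them down at compile time on x86-64 (not part of this commit; the include name is hypothetical):

#include <cstddef> // offsetof
#include "threadwise_param.hpp" // hypothetical name for the header above

using ck::cpu::ThreadwiseGemmParam;

// three pointers and four uint64_t members occupy 8 bytes each back to
// back; alpha and _pack0 pad the struct out to a full 64 bytes.
static_assert(offsetof(ThreadwiseGemmParam, p_a) == 0, "p_a at byte 0");
static_assert(offsetof(ThreadwiseGemmParam, p_b) == 8, "p_b at byte 8");
static_assert(offsetof(ThreadwiseGemmParam, p_c) == 16, "p_c at byte 16");
static_assert(offsetof(ThreadwiseGemmParam, Kr) == 24, "Kr at byte 24");
static_assert(offsetof(ThreadwiseGemmParam, lda) == 32, "lda at byte 32");
static_assert(offsetof(ThreadwiseGemmParam, ldb) == 40, "ldb at byte 40");
static_assert(offsetof(ThreadwiseGemmParam, ldc) == 48, "ldc at byte 48");
static_assert(offsetof(ThreadwiseGemmParam, alpha) == 56, "alpha at byte 56");
static_assert(sizeof(ThreadwiseGemmParam) == 64, "64-byte parameter block");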
#ifndef CK_CPUID_HPP
#define CK_CPUID_HPP

#include <cstdint>

namespace ck {
namespace cpu {

enum cpuid_vendor
{
    cpuid_vendor_intel = 0,
    cpuid_vendor_amd   = 1,
    cpuid_vendor_other = 2,
};

enum cpuid_cache_type
{
    cpuid_cache_type_null    = 0,
    cpuid_cache_type_dcache  = 1,
    cpuid_cache_type_icache  = 2,
    cpuid_cache_type_unified = 3,
};

struct cpuid_raw
{
    uint32_t eax{0};
    uint32_t ebx{0};
    uint32_t ecx{0};
    uint32_t edx{0};
};

struct cpuid_cache_detail
{
    uint32_t size{0};
    uint32_t type{0};
    uint32_t cache_line_size{0};
    uint32_t associativity{0};
    uint32_t sets{0};
    uint32_t partitions{0};
    uint32_t shared_by_procs{0};  // logical processors sharing this cache; with
                                  // hyper-threading this is usually 2 for L1/L2,
                                  // or 1 when HT is disabled
    uint32_t cores_per_socket{0}; // hardware cores in one physical socket; a
                                  // system may have multiple sockets.
                                  // TODO: may not be needed?
    uint32_t flags{0};
};

struct cpuid_cache_hierarchy
{
    cpuid_cache_detail l1i;
    cpuid_cache_detail l1d;
    cpuid_cache_detail l2;
    cpuid_cache_detail l3;
    cpuid_cache_detail l4;
};
static inline cpuid_raw cpuid(uint32_t eax, uint32_t ecx)
{
    // some leaves take a subleaf index in ecx; for the others ecx is ignored.
    uint32_t ebx, edx;
    // pin each operand to the register cpuid actually reads and writes, so the
    // compiler knows all four of eax/ebx/ecx/edx are produced (and clobbered);
    // generic "=r" outputs would let cpuid silently corrupt live registers.
    asm volatile("cpuid"
                 : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                 : "0"(eax), "2"(ecx));
    return {eax, ebx, ecx, edx};
}
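For reference, GCC and Clang also ship a <cpuid.h> helper that performs the same subleaf query with the register constraints handled internally. A sketch of an equivalent wrapper (an alternative, not what this commit uses; assumes cpuid_raw from above):

#include <cpuid.h> // GCC/Clang intrinsic header

static inline cpuid_raw cpuid_via_intrinsic(uint32_t leaf, uint32_t subleaf)
{
    cpuid_raw r;
    // __get_cpuid_count validates the requested leaf against the maximum
    // supported leaf and returns 0 when it is out of range.
    if(!__get_cpuid_count(leaf, subleaf, &r.eax, &r.ebx, &r.ecx, &r.edx))
        return {}; // all-zero result; decodes as cpuid_cache_type_null
    return r;
}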
static inline cpuid_vendor cpuid_query_vendor()
{
    cpuid_raw r = cpuid(0, 0);
    if(r.ebx == 0x756E6547U /*Genu*/ && r.edx == 0x49656E69U /*ineI*/ &&
       r.ecx == 0x6C65746EU /*ntel*/)
    {
        return cpuid_vendor_intel;
    }
    if(r.ebx == 0x68747541U /*Auth*/ && r.edx == 0x69746E65U /*enti*/ &&
       r.ecx == 0x444D4163U /*cAMD*/)
    {
        return cpuid_vendor_amd;
    }
    if(r.ebx == 0x69444D41U /*AMDi*/ && r.edx == 0x74656273U /*sbet*/ &&
       r.ecx == 0x21726574U /*ter!*/)
    {
        return cpuid_vendor_amd;
    }
    return cpuid_vendor_other;
}
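The magic constants above are just the 12-byte vendor string ("GenuineIntel", "AuthenticAMD", or the early-AMD "AMDisbetter!") chunked little-endian into EBX, EDX, ECX, in that order. A sketch that reconstructs the readable string, handy when adding a new vendor (not part of this commit):

#include <cstdio>
#include <cstring>

static inline void print_cpu_vendor_string()
{
    ck::cpu::cpuid_raw r = ck::cpu::cpuid(0, 0);
    char vendor[13]      = {0};
    // leaf 0 returns the vendor string in EBX, EDX, ECX order
    std::memcpy(vendor + 0, &r.ebx, 4);
    std::memcpy(vendor + 4, &r.edx, 4);
    std::memcpy(vendor + 8, &r.ecx, 4);
    std::printf("vendor: %s\n", vendor);
}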
static inline cpuid_cache_hierarchy cpuid_query_cache()
{
    cpuid_cache_hierarchy cache_hierarchy;
    cpuid_vendor vendor = cpuid_query_vendor();
    // AMD reports deterministic cache parameters through leaf 0x8000001d,
    // Intel through leaf 0x4; both use the same field layout.
    uint32_t leaf_cache_id = vendor == cpuid_vendor_amd ? 0x8000001d : 0x4;
    for(uint32_t ecx_idx = 0;; ecx_idx++)
    {
        cpuid_raw r         = cpuid(leaf_cache_id, ecx_idx);
        uint32_t cache_type = r.eax & 0x1f;
        if(cache_type == cpuid_cache_type_null)
            break; // null type, no more cache levels

        uint32_t cache_level           = (r.eax >> 5) & 0x7;
        uint32_t cache_shared_by_cores = 1 + ((r.eax >> 14) & 0xfff);
        uint32_t cache_lpp_cores       = 1 + ((r.eax >> 26) & 0x3f);
        uint32_t cache_line_size       = 1 + (r.ebx & 0xfff);
        uint32_t cache_partitions      = 1 + ((r.ebx >> 12) & 0x3ff);
        uint32_t cache_associativity   = 1 + (r.ebx >> 22);
        uint32_t cache_sets            = 1 + r.ecx;

        auto fill_detail = [&](cpuid_cache_detail& detail) {
            detail.size = cache_partitions * cache_sets * cache_associativity * cache_line_size;
            detail.type             = cache_type;
            detail.cache_line_size  = cache_line_size;
            detail.associativity    = cache_associativity;
            detail.sets             = cache_sets;
            detail.partitions       = cache_partitions;
            detail.shared_by_procs  = cache_shared_by_cores;
            detail.cores_per_socket = cache_lpp_cores;
        };

        bool is_data_or_unified =
            cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified;

        switch(cache_level)
        {
        case 1:
            if(is_data_or_unified)
                fill_detail(cache_hierarchy.l1d);
            else if(cache_type == cpuid_cache_type_icache)
                fill_detail(cache_hierarchy.l1i);
            break;
        case 2:
            if(is_data_or_unified)
                fill_detail(cache_hierarchy.l2);
            break;
        case 3:
            if(is_data_or_unified)
                fill_detail(cache_hierarchy.l3);
            break;
        case 4:
            if(is_data_or_unified)
                fill_detail(cache_hierarchy.l4);
            break;
        }
    }
    return cache_hierarchy;
}
} // namespace cpu
} // namespace ck
#endif
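With the hierarchy query in place, a kernel can size its K loop so the streamed A/B panels stay resident in L1d, which is what the "loop over L1" part of the commit message points at. A usage sketch (assumes the header above is included; the blocking heuristic is illustrative, not the commit's actual policy):

#include <cstdio>

int main()
{
    ck::cpu::cpuid_cache_hierarchy caches = ck::cpu::cpuid_query_cache();
    std::printf("L1d %u bytes, L2 %u bytes, L3 %u bytes\n",
                caches.l1d.size, caches.l2.size, caches.l3.size);

    // illustrative heuristic only: keep one MRxK panel of A and one KxNR
    // panel of B (floats) within roughly half of L1d, leaving room for C.
    const uint32_t MR = 6, NR = 16; // hypothetical microkernel tile
    uint32_t k_block =
        static_cast<uint32_t>((caches.l1d.size / 2) / ((MR + NR) * sizeof(float)));
    std::printf("K block that fits L1d: %u\n", k_block);
    return 0;
}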