Commit 35f95fe9 authored by carlushuang

movaps->movups, and support loop over L1

parent e72c0c43
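For context on the commit title: movaps requires a 16-byte-aligned memory operand and faults on unaligned addresses, while movups accepts any alignment (and on recent cores pays little or no penalty when the address happens to be aligned). A minimal illustration of the difference using SSE intrinsics; the function and its buffer are hypothetical and not part of this commit:
#include <immintrin.h>
// Load 4 floats from a pointer with no alignment guarantee.
// _mm_loadu_ps lowers to movups and is safe for any address;
// _mm_load_ps (movaps) would fault if p were not 16-byte aligned.
static inline float sum4_unaligned(const float* p)
{
    __m128 v = _mm_loadu_ps(p);
    float tmp[4];
    _mm_storeu_ps(tmp, v);
    return tmp[0] + tmp[1] + tmp[2] + tmp[3];
}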
#ifndef CK_THREADWISE_PARAM_HPP
#define CK_THREADWISE_PARAM_HPP
#include "common_header.hpp"
#include "math.hpp"
namespace ck {
namespace cpu {
struct ThreadwiseGemmParam
{
const void* p_a;
const void* p_b;
void* p_c;
uint64_t Kr;
uint64_t lda; // in units of bytes
uint64_t ldb; // in units of bytes
uint64_t ldc; // in units of bytes
float alpha;
uint32_t _pack0;
} __attribute__((packed));
} // namespace cpu
} // namespace ck
#endif
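The packed parameter block above is what a threadwise GEMM micro-kernel consumes. A minimal usage sketch for a row-major fp32 tile follows; the pointer names, tile extents, and the ukernel entry point are hypothetical and not defined in this commit:
// Hypothetical usage: all strides are in bytes, matching the comments in the struct.
ck::cpu::ThreadwiseGemmParam param{};
param.p_a   = a_ptr;             // packed A tile (fp32), assumption
param.p_b   = b_ptr;             // packed B tile (fp32), assumption
param.p_c   = c_ptr;             // C tile (fp32), assumption
param.Kr    = K;                 // reduction extent handed to the micro-kernel (assumption)
param.lda   = K * sizeof(float); // row stride of A in bytes
param.ldb   = N * sizeof(float); // row stride of B in bytes
param.ldc   = N * sizeof(float); // row stride of C in bytes
param.alpha = 1.0f;
// ukernel(&param);              // hypothetical micro-kernel entry point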
#ifndef CK_CPUID_HPP
#define CK_CPUID_HPP
#include <cstdint>
namespace ck {
namespace cpu {
enum cpuid_vendor
{
cpuid_vendor_intel = 0,
cpuid_vendor_amd = 1,
cpuid_vendor_other = 2,
};
enum cpuid_cache_type
{
cpuid_cache_type_null = 0,
cpuid_cache_type_dcache = 1,
cpuid_cache_type_icache = 2,
cpuid_cache_type_unified = 3,
};
struct cpuid_raw
{
uint32_t eax{0};
uint32_t ebx{0};
uint32_t ecx{0};
uint32_t edx{0};
};
struct cpuid_cache_detail
{
uint32_t size{0};
uint32_t type{0};
uint32_t cache_line_size{0};
uint32_t associativity{0};
uint32_t sets{0};
uint32_t partitions{0};
uint32_t shared_by_procs{0}; // logical processors sharing this cache; with HT/SMT enabled
// this is typically 2 for L1/L2, and 1 when HT is disabled
uint32_t cores_per_socket{0}; // hardware cores per physical socket; a system may have
// multiple sockets. TODO: may not be needed
uint32_t flags{0};
};
struct cpuid_cache_hierarchy
{
cpuid_cache_detail l1i;
cpuid_cache_detail l1d;
cpuid_cache_detail l2;
cpuid_cache_detail l3;
cpuid_cache_detail l4;
};
static inline cpuid_raw cpuid(uint32_t eax, uint32_t ecx)
{
// some leaves require a sub-leaf index in ecx;
// for others, ecx is ignored.
uint32_t ebx, edx;
asm __volatile__("mov %0, %%eax\n"
"mov %2, %%ecx\n"
"cpuid\n"
"mov %%eax, %0\n"
"mov %%ebx, %1\n"
"mov %%ecx, %2\n"
"mov %%edx, %3\n"
: "=r"(eax), "=r"(ebx), "=r"(ecx), "=r"(edx)
: "0"(eax), "2"(ecx));
return {eax, ebx, ecx, edx};
}
static inline cpuid_vendor cpuid_query_vendor()
{
cpuid_raw r = cpuid(0, 0);
if(r.ebx == 0x756E6547U /*Genu*/ && r.edx == 0x49656E69U /*ineI*/ &&
r.ecx == 0x6C65746EU /*ntel*/)
{
return cpuid_vendor_intel;
}
if(r.ebx == 0x68747541U /*Auth*/ && r.edx == 0x69746E65U /*enti*/ &&
r.ecx == 0x444D4163U /*cAMD*/)
{
return cpuid_vendor_amd;
}
if(r.ebx == 0x69444D41U /*AMDi*/ && r.edx == 0x74656273U /*sbet*/ &&
r.ecx == 0x21726574U /*ter!*/)
{
return cpuid_vendor_amd;
}
return cpuid_vendor_other;
}
static inline cpuid_cache_hierarchy cpuid_query_cache()
{
cpuid_cache_hierarchy cache_hierarchy;
cpuid_vendor vendor = cpuid_query_vendor();
uint32_t leaf_cache_id = vendor == cpuid_vendor_amd ? 0x8000001d : 0x4;
for(uint32_t ecx_idx = 0;; ecx_idx++)
{
cpuid_raw r = cpuid(leaf_cache_id, ecx_idx);
uint32_t cache_type = r.eax & 0x1f;
if(cache_type == cpuid_cache_type_null)
break; // Null, no more cache
uint32_t cache_level = (r.eax >> 5) & 0x7;
uint32_t cache_shared_by_cores = 1 + ((r.eax >> 14) & 0xfff);
uint32_t cache_lpp_cores = 1 + ((r.eax >> 26) & 0x3f);
uint32_t cache_line_size = 1 + (r.ebx & 0xfff);
uint32_t cache_partitions = 1 + ((r.ebx >> 12) & 0x3ff);
uint32_t cache_associativity = 1 + (r.ebx >> 22);
uint32_t cache_sets = 1 + r.ecx;
switch(cache_level)
{
case 1:
if(cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified)
{
cache_hierarchy.l1d.size =
cache_partitions * cache_sets * cache_associativity * cache_line_size;
cache_hierarchy.l1d.type = cache_type;
cache_hierarchy.l1d.cache_line_size = cache_line_size;
cache_hierarchy.l1d.associativity = cache_associativity;
cache_hierarchy.l1d.sets = cache_sets;
cache_hierarchy.l1d.partitions = cache_partitions;
cache_hierarchy.l1d.shared_by_procs = cache_shared_by_cores;
cache_hierarchy.l1d.cores_per_socket = cache_lpp_cores;
}
else if(cache_type == cpuid_cache_type_icache)
{
cache_hierarchy.l1i.size =
cache_partitions * cache_sets * cache_associativity * cache_line_size;
cache_hierarchy.l1i.type = cache_type;
cache_hierarchy.l1i.cache_line_size = cache_line_size;
cache_hierarchy.l1i.associativity = cache_associativity;
cache_hierarchy.l1i.sets = cache_sets;
cache_hierarchy.l1i.partitions = cache_partitions;
cache_hierarchy.l1i.shared_by_procs = cache_shared_by_cores;
cache_hierarchy.l1i.cores_per_socket = cache_lpp_cores;
}
break;
case 2:
if(cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified)
{
cache_hierarchy.l2.size =
cache_partitions * cache_sets * cache_associativity * cache_line_size;
cache_hierarchy.l2.type = cache_type;
cache_hierarchy.l2.cache_line_size = cache_line_size;
cache_hierarchy.l2.associativity = cache_associativity;
cache_hierarchy.l2.sets = cache_sets;
cache_hierarchy.l2.partitions = cache_partitions;
cache_hierarchy.l2.shared_by_procs = cache_shared_by_cores;
cache_hierarchy.l2.cores_per_socket = cache_lpp_cores;
}
break;
case 3:
if(cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified)
{
cache_hierarchy.l3.size =
cache_partitions * cache_sets * cache_associativity * cache_line_size;
cache_hierarchy.l3.type = cache_type;
cache_hierarchy.l3.cache_line_size = cache_line_size;
cache_hierarchy.l3.associativity = cache_associativity;
cache_hierarchy.l3.sets = cache_sets;
cache_hierarchy.l3.partitions = cache_partitions;
cache_hierarchy.l3.shared_by_procs = cache_shared_by_cores;
cache_hierarchy.l3.cores_per_socket = cache_lpp_cores;
}
break;
case 4:
if(cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified)
{
cache_hierarchy.l4.size =
cache_partitions * cache_sets * cache_associativity * cache_line_size;
cache_hierarchy.l4.type = cache_type;
cache_hierarchy.l4.cache_line_size = cache_line_size;
cache_hierarchy.l4.associativity = cache_associativity;
cache_hierarchy.l4.sets = cache_sets;
cache_hierarchy.l4.partitions = cache_partitions;
cache_hierarchy.l4.shared_by_procs = cache_shared_by_cores;
cache_hierarchy.l4.cores_per_socket = cache_lpp_cores;
}
break;
}
}
return cache_hierarchy;
}
} // namespace cpu
} // namespace ck
#endif
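A short usage sketch tying the query above to the "loop over L1" in the commit title: read the detected L1 data-cache geometry and derive an illustrative blocking factor so a packed panel stays resident in L1d. The include path and the half-of-L1d heuristic are assumptions for illustration only:
#include <cstdio>
#include "cpuid.hpp" // this header; path illustrative
int main()
{
    ck::cpu::cpuid_cache_hierarchy caches = ck::cpu::cpuid_query_cache();
    std::printf("L1d: %u bytes, line %u B, assoc %u\n",
                caches.l1d.size, caches.l1d.cache_line_size, caches.l1d.associativity);
    // Illustrative heuristic: reserve roughly half of L1d for one packed fp32 panel.
    unsigned block_elems = (caches.l1d.size / 2) / sizeof(float);
    std::printf("fp32 elements per L1 block (illustrative): %u\n", block_elems);
    return 0;
}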