gaoqiong / composable_kernel · Commits · 35f95fe9

Commit 35f95fe9, authored Mar 27, 2022 by carlushuang:

    movaps->movups, and support loop over L1
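For context on the message (not part of the diff): movaps is the aligned 16-byte SSE move and faults when its memory operand is not 16-byte aligned, whereas movups accepts any address, so the switch lets the micro-kernel handle tiles whose base address or leading dimension is not 16-byte aligned. A minimal intrinsics illustration; load_unaligned/load_aligned are hypothetical names, while _mm_loadu_ps/_mm_load_ps are the standard intrinsics that compile to movups/movaps-class loads:

#include <immintrin.h>

// movups-class load: safe for any address
__m128 load_unaligned(const float* p) { return _mm_loadu_ps(p); }

// movaps-class load: p must be 16-byte aligned, otherwise the CPU faults
__m128 load_aligned(const float* p) { return _mm_load_ps(p); }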
Parent: e72c0c43

Showing 4 changed files, with 911 additions and 839 deletions:
  include/ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp  (+305, -305)
  include/ck/tensor_operation/cpu/thread/threadwise_param.hpp      (+26, -26)
  include/ck/utility/cpuid.hpp                                     (+193, -193)
  test/cpu_ukernel/cpu_gemm_uk.cpp                                 (+387, -315)
include/ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp (view file @ 35f95fe9)

(This diff is collapsed in the captured view and not shown here.)
include/ck/tensor_operation/cpu/thread/threadwise_param.hpp (view file @ 35f95fe9)
#ifndef CK_THREADWISE_PARAM_HPP
#define CK_THREADWISE_PARAM_HPP

#include "common_header.hpp"
#include "math.hpp"

namespace ck {
namespace cpu {

struct ThreadwiseGemmParam
{
    const void* p_a;
    const void* p_b;
    void* p_c;
    uint64_t Kr;
    uint64_t lda; // in units of bytes
    uint64_t ldb; // in units of bytes
    uint64_t ldc; // in units of bytes
    float alpha;
    uint32_t _pack0; // padding
} __attribute__((packed));

} // namespace cpu
} // namespace ck
#endif
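For reference, a minimal sketch of how a caller might fill this packed parameter block before invoking a micro-kernel. The ukernel_fn type and the run_sgemm_ukernel wrapper are hypothetical stand-ins; only the ThreadwiseGemmParam layout comes from the header above.

#include <cstdint>

// hypothetical micro-kernel signature; the real AVX2 kernels live in
// threadwise_gemm_avx2.hpp
using ukernel_fn = void (*)(ck::cpu::ThreadwiseGemmParam*);

void run_sgemm_ukernel(ukernel_fn uk,
                       const float* a, const float* b, float* c,
                       uint64_t k, uint64_t lda, uint64_t ldb, uint64_t ldc)
{
    ck::cpu::ThreadwiseGemmParam param{};
    param.p_a   = a;
    param.p_b   = b;
    param.p_c   = c;
    param.Kr    = k;
    param.lda   = lda * sizeof(float); // leading dimensions are in bytes
    param.ldb   = ldb * sizeof(float);
    param.ldc   = ldc * sizeof(float);
    param.alpha = 1.0f;
    uk(&param);
}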
include/ck/utility/cpuid.hpp (view file @ 35f95fe9)
#ifndef CK_CPUID_HPP
#define CK_CPUID_HPP

#include <cstdint>

namespace ck {
namespace cpu {

enum cpuid_vendor
{
    cpuid_vendor_intel = 0,
    cpuid_vendor_amd   = 1,
    cpuid_vendor_other = 2,
};

enum cpuid_cache_type
{
    cpuid_cache_type_null    = 0,
    cpuid_cache_type_dcache  = 1,
    cpuid_cache_type_icache  = 2,
    cpuid_cache_type_unified = 3,
};

struct cpuid_raw
{
    uint32_t eax{0};
    uint32_t ebx{0};
    uint32_t ecx{0};
    uint32_t edx{0};
};

struct cpuid_cache_detail
{
    uint32_t size{0};
    uint32_t type{0};
    uint32_t cache_line_size{0};
    uint32_t associativity{0};
    uint32_t sets{0};
    uint32_t partitions{0};
    uint32_t shared_by_procs{0};  // with hyper-threading there are usually 2 threads per
                                  // core, hence for L1/L2 this is usually 2, unless HT is
                                  // turned off
    uint32_t cores_per_socket{0}; // hardware cores in a physical socket; there may be
                                  // multiple sockets in the system. TODO: may not be needed?
    uint32_t flags{0};
};

struct cpuid_cache_hierarchy
{
    cpuid_cache_detail l1i;
    cpuid_cache_detail l1d;
    cpuid_cache_detail l2;
    cpuid_cache_detail l3;
    cpuid_cache_detail l4;
};

static inline cpuid_raw cpuid(uint32_t eax, uint32_t ecx)
{
    // some leaves require a sub-leaf index in ecx; for others, ecx is unused
    uint32_t ebx, edx;
    asm __volatile__("mov %0, %%eax\n"
                     "mov %2, %%ecx\n"
                     "cpuid\n"
                     "mov %%eax, %0\n"
                     "mov %%ebx, %1\n"
                     "mov %%ecx, %2\n"
                     "mov %%edx, %3\n"
                     : "=r"(eax), "=r"(ebx), "=r"(ecx), "=r"(edx)
                     : "0"(eax), "2"(ecx));
    return {eax, ebx, ecx, edx};
}

static inline cpuid_vendor cpuid_query_vendor()
{
    cpuid_raw r = cpuid(0, 0);
    // "GenuineIntel"
    if(r.ebx == 0x756E6547U /*Genu*/ && r.edx == 0x49656E69U /*ineI*/ &&
       r.ecx == 0x6C65746EU /*ntel*/)
    {
        return cpuid_vendor_intel;
    }
    // "AuthenticAMD"
    if(r.ebx == 0x68747541U /*Auth*/ && r.edx == 0x69746E65U /*enti*/ &&
       r.ecx == 0x444D4163U /*cAMD*/)
    {
        return cpuid_vendor_amd;
    }
    // "AMDisbetter!" (early AMD parts)
    if(r.ebx == 0x69444D41U /*AMDi*/ && r.edx == 0x74656273U /*sbet*/ &&
       r.ecx == 0x21726574U /*ter!*/)
    {
        return cpuid_vendor_amd;
    }
    return cpuid_vendor_other;
}

static inline cpuid_cache_hierarchy cpuid_query_cache()
{
    cpuid_cache_hierarchy cache_hierarchy;
    cpuid_vendor vendor    = cpuid_query_vendor();
    uint32_t leaf_cache_id = vendor == cpuid_vendor_amd ? 0x8000001d : 0x4;
    for(uint32_t ecx_idx = 0;; ecx_idx++)
    {
        cpuid_raw r         = cpuid(leaf_cache_id, ecx_idx);
        uint32_t cache_type = r.eax & 0x1f;
        if(cache_type == cpuid_cache_type_null)
            break; // null, no more caches

        uint32_t cache_level           = (r.eax >> 5) & 0x7;
        uint32_t cache_shared_by_cores = 1 + ((r.eax >> 14) & 0xfff);
        uint32_t cache_lpp_cores       = 1 + ((r.eax >> 26) & 0x3f);
        uint32_t cache_line_size       = 1 + (r.ebx & 0xfff);
        uint32_t cache_partitions      = 1 + ((r.ebx >> 12) & 0x3ff);
        uint32_t cache_associativity   = 1 + (r.ebx >> 22);
        uint32_t cache_sets            = 1 + r.ecx;

        switch(cache_level)
        {
        case 1:
            if(cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified)
            {
                cache_hierarchy.l1d.size =
                    cache_partitions * cache_sets * cache_associativity * cache_line_size;
                cache_hierarchy.l1d.type             = cache_type;
                cache_hierarchy.l1d.cache_line_size  = cache_line_size;
                cache_hierarchy.l1d.associativity    = cache_associativity;
                cache_hierarchy.l1d.sets             = cache_sets;
                cache_hierarchy.l1d.partitions       = cache_partitions;
                cache_hierarchy.l1d.shared_by_procs  = cache_shared_by_cores;
                cache_hierarchy.l1d.cores_per_socket = cache_lpp_cores;
            }
            else if(cache_type == cpuid_cache_type_icache)
            {
                cache_hierarchy.l1i.size =
                    cache_partitions * cache_sets * cache_associativity * cache_line_size;
                cache_hierarchy.l1i.type             = cache_type;
                cache_hierarchy.l1i.cache_line_size  = cache_line_size;
                cache_hierarchy.l1i.associativity    = cache_associativity;
                cache_hierarchy.l1i.sets             = cache_sets;
                cache_hierarchy.l1i.partitions       = cache_partitions;
                cache_hierarchy.l1i.shared_by_procs  = cache_shared_by_cores;
                cache_hierarchy.l1i.cores_per_socket = cache_lpp_cores;
            }
            break;
        case 2:
            if(cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified)
            {
                cache_hierarchy.l2.size =
                    cache_partitions * cache_sets * cache_associativity * cache_line_size;
                cache_hierarchy.l2.type             = cache_type;
                cache_hierarchy.l2.cache_line_size  = cache_line_size;
                cache_hierarchy.l2.associativity    = cache_associativity;
                cache_hierarchy.l2.sets             = cache_sets;
                cache_hierarchy.l2.partitions       = cache_partitions;
                cache_hierarchy.l2.shared_by_procs  = cache_shared_by_cores;
                cache_hierarchy.l2.cores_per_socket = cache_lpp_cores;
            }
            break;
        case 3:
            if(cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified)
            {
                cache_hierarchy.l3.size =
                    cache_partitions * cache_sets * cache_associativity * cache_line_size;
                cache_hierarchy.l3.type             = cache_type;
                cache_hierarchy.l3.cache_line_size  = cache_line_size;
                cache_hierarchy.l3.associativity    = cache_associativity;
                cache_hierarchy.l3.sets             = cache_sets;
                cache_hierarchy.l3.partitions       = cache_partitions;
                cache_hierarchy.l3.shared_by_procs  = cache_shared_by_cores;
                cache_hierarchy.l3.cores_per_socket = cache_lpp_cores;
            }
            break;
        case 4:
            if(cache_type == cpuid_cache_type_dcache || cache_type == cpuid_cache_type_unified)
            {
                cache_hierarchy.l4.size =
                    cache_partitions * cache_sets * cache_associativity * cache_line_size;
                cache_hierarchy.l4.type             = cache_type;
                cache_hierarchy.l4.cache_line_size  = cache_line_size;
                cache_hierarchy.l4.associativity    = cache_associativity;
                cache_hierarchy.l4.sets             = cache_sets;
                cache_hierarchy.l4.partitions       = cache_partitions;
                cache_hierarchy.l4.shared_by_procs  = cache_shared_by_cores;
                cache_hierarchy.l4.cores_per_socket = cache_lpp_cores;
            }
            break;
        }
    }
    return cache_hierarchy;
}

} // namespace cpu
} // namespace ck
#endif
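As a usage sketch, a small standalone program that queries the hierarchy and prints the numbers an L1-resident blocking loop would care about. The include path and the printout are assumptions; cpuid_query_cache() and the field names come from the header above.

#include <cstdio>

#include "ck/utility/cpuid.hpp"

int main()
{
    ck::cpu::cpuid_cache_hierarchy caches = ck::cpu::cpuid_query_cache();

    // e.g. an L1-resident GEMM tile must fit within caches.l1d.size bytes
    std::printf("L1d: %u bytes, %u-way, %u-byte lines, shared by %u procs\n",
                caches.l1d.size,
                caches.l1d.associativity,
                caches.l1d.cache_line_size,
                caches.l1d.shared_by_procs);
    std::printf("L2:  %u bytes\n", caches.l2.size);
    std::printf("L3:  %u bytes\n", caches.l3.size);
    return 0;
}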
test/cpu_ukernel/cpu_gemm_uk.cpp (view file @ 35f95fe9)

(This diff is collapsed in the captured view and not shown here.)