yangql / composable_kernel-1 · Commits

Commit 86cc678f, authored Sep 18, 2019 by Chao Liu
Parent: 5b7a18c5

    add global_load and buffer_load inline asm
Showing 3 changed files, with 128 additions and 86 deletions:
composable_kernel/include/utility/amd_inline_asm.hpp (+126, -84)
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp (+1, -1)
driver/src/driver.cpp (+1, -1)
composable_kernel/include/utility/amd_inline_asm.hpp
@@ -10,170 +10,203 @@ extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p
 // global_load and global_store
 template <typename T, index_t VectorSize>
 __device__ typename vector_type<T, VectorSize>::MemoryType __global_load(const T* p_src_block,
-                                                                          uint32_t src_thread_offset,
-                                                                          uint32_t src_const_offset);
+                                                                          uint32_t src_thread_data_offset,
+                                                                          uint32_t src_const_data_offset);
 
 template <typename T, index_t VectorSize>
 __device__ void __global_store(const typename vector_type<T, VectorSize>::MemoryType& src,
                                T* p_dst_block,
-                               uint32_t dst_thread_offset,
-                               uint32_t dst_const_offset);
+                               uint32_t dst_thread_data_offset,
+                               uint32_t dst_const_data_offset);
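Both helpers take element ("data") offsets rather than byte offsets; the scaling by sizeof(T) happens inside. A minimal usage sketch, assuming float elements; copy_kernel and its arguments are illustrative names, not part of this commit:

// Hypothetical usage sketch (not from this commit): one float per thread,
// routed through the new helpers. Offsets are element counts, not bytes.
__global__ void copy_kernel(const float* p_src, float* p_dst)
{
    uint32_t thread_data_offset = hipThreadIdx_x;                // per-thread part (VGPR)
    uint32_t const_data_offset  = hipBlockIdx_x * hipBlockDim_x; // uniform part (SGPR)

    float v = __global_load<float, 1>(p_src, thread_data_offset, const_data_offset);
    __global_store<float, 1>(v, p_dst, thread_data_offset, const_data_offset);
}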
 template <>
 __device__ float __global_load<float, 1>(const float* p_src_block,
-                                         uint32_t src_thread_offset,
-                                         uint32_t src_const_offset)
+                                         uint32_t src_thread_data_offset,
+                                         uint32_t src_const_data_offset)
 {
-#if 0 // compute on VALU
     float dst;
 
-    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+#if 0 // source code
+    dst = p_src_block[src_const_data_offset + src_thread_data_offset];
+#elif 0 // use VGPR only
+    const float* src_thread_addr_offset_u64 =
+        p_src_block + src_const_data_offset + src_thread_data_offset;
 
     asm volatile("\n \
-    global_load_dword %0, %1, %2, offset:0 \n \
+    global_load_dword %0, %1 off offset:0 \n \
     s_waitcnt 0 \n \
     "
                  : "=v"(dst)
-                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+                 : "v"(src_thread_addr_offset_u64));
+#elif 0 // use VGPR and SGPR, do compute on VALU
+    uint64_t src_thread_addr_offset_u64 =
+        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
 
-    return dst;
-#else // compute on SALU
-    float dst;
+    asm volatile("\n \
+    global_load_dword %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
+#elif 1 // use VGPR and SGPR, do compute on SALU
+    uint64_t src_thread_addr_offset_u64 =
+        static_cast<uint64_t>(src_thread_data_offset * sizeof(float));
 
-    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
-
-    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
 
     asm volatile("\n \
     global_load_dword %0, %1, %2, offset:0 \n \
-;;s_waitcnt 0 \n \
+    s_waitcnt 0 \n \
     "
                  : "=v"(dst)
-                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
+#endif
 
     return dst;
-#endif
 }
 template <>
 __device__ vector_type<float, 2>::MemoryType __global_load<float, 2>(const float* p_src_block,
-                                                                      uint32_t src_thread_offset,
-                                                                      uint32_t src_const_offset)
+                                                                      uint32_t src_thread_data_offset,
+                                                                      uint32_t src_const_data_offset)
 {
-#if 0 // compute on VALU
-    vector_type<float, 2>::MemoryType dst;
+    using vector_t = vector_type<float, 2>::MemoryType;
 
-    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+    vector_t dst;
+
+#if 0 // source code
+    dst = *reinterpret_cast<const vector_t*>(
+        &p_src_block[src_const_data_offset + src_thread_data_offset]);
+#elif 0 // use VGPR only
+    const float* src_thread_addr_offset_u64 =
+        p_src_block + src_const_data_offset + src_thread_data_offset;
 
     asm volatile("\n \
-    global_load_dwordx2 %0, %1, %2, offset:0 \n \
+    global_load_dwordx2 %0, %1 off offset:0 \n \
     s_waitcnt 0 \n \
     "
                  : "=v"(dst)
-                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+                 : "v"(src_thread_addr_offset_u64));
+#elif 0 // use VGPR and SGPR, do compute on VALU
+    uint64_t src_thread_addr_offset_u64 =
+        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
 
-    return dst;
-#else // compute on SALU
-    vector_type<float, 2>::MemoryType dst;
+    asm volatile("\n \
+    global_load_dwordx2 %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
+#elif 1 // use VGPR and SGPR, do compute on SALU
+    uint64_t src_thread_addr_offset_u64 =
+        static_cast<uint64_t>(src_thread_data_offset * sizeof(float));
 
-    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
-
-    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
 
     asm volatile("\n \
     global_load_dwordx2 %0, %1, %2, offset:0 \n \
-;;s_waitcnt 0 \n \
+    s_waitcnt 0 \n \
     "
                  : "=v"(dst)
-                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
+#endif
 
     return dst;
-#endif
 }
 template <>
 __device__ vector_type<float, 4>::MemoryType __global_load<float, 4>(const float* p_src_block,
-                                                                      uint32_t src_thread_offset,
-                                                                      uint32_t src_const_offset)
+                                                                      uint32_t src_thread_data_offset,
+                                                                      uint32_t src_const_data_offset)
 {
-#if 0 // compute on VALU
-    vector_type<float, 4>::MemoryType dst;
+    using vector_t = vector_type<float, 4>::MemoryType;
 
-    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+    vector_t dst;
+
+#if 0 // source code
+    dst = *reinterpret_cast<const vector_t*>(
+        &p_src_block[src_const_data_offset + src_thread_data_offset]);
+#elif 0 // use VGPR only
+    const float* src_thread_addr_offset_u64 =
+        p_src_block + src_const_data_offset + src_thread_data_offset;
 
     asm volatile("\n \
-    global_load_dwordx4 %0, %1, %2, offset:0 \n \
+    global_load_dwordx4 %0, %1 off offset:0 \n \
     s_waitcnt 0 \n \
     "
                  : "=v"(dst)
-                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+                 : "v"(src_thread_addr_offset_u64));
+#elif 0 // use VGPR and SGPR, do compute on VALU
+    uint64_t src_thread_addr_offset_u64 =
+        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
 
-    return dst;
-#else // compute on SALU
-    vector_type<float, 4>::MemoryType dst;
+    asm volatile("\n \
+    global_load_dwordx4 %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
+#elif 1 // use VGPR and SGPR, do compute on SALU
+    uint64_t src_thread_addr_offset_u64 =
+        static_cast<uint64_t>(src_thread_data_offset * sizeof(float));
 
-    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
-
-    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
 
     asm volatile("\n \
     global_load_dwordx4 %0, %1, %2, offset:0 \n \
-;;s_waitcnt 0 \n \
+    s_waitcnt 0 \n \
     "
                  : "=v"(dst)
-                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
+#endif
 
     return dst;
-#endif
 }
 template <>
 __device__ void __global_store<float, 1>(const float& src,
                                          float* p_dst_block,
-                                         uint32_t dst_thread_offset,
-                                         uint32_t dst_const_offset)
+                                         uint32_t dst_thread_data_offset,
+                                         uint32_t dst_const_data_offset)
 {
 #if 0 // compute on VALU
-    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset + dst_const_offset);
+    uint64_t dst_thread_data_offset_u64 = (dst_thread_data_offset + dst_const_data_offset) * sizeof(float);
 
     asm volatile("\n \
     global_store_dword %0, %1, %2, offset:0 \n \
     s_waitcnt 0 \n \
     "
                  :
-                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block));
+                 : "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block));
 #else // compute on SALU
-    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset);
+    uint64_t dst_thread_data_offset_u64 = dst_thread_data_offset * sizeof(float);
 
-    float* p_dst_block_with_offset = p_dst_block + dst_const_offset;
+    float* p_dst_block_with_offset = p_dst_block + dst_const_data_offset;
 
     asm volatile("\n \
     global_store_dword %0, %1, %2, offset:0 \n \
 ;;s_waitcnt 0 \n \
     "
                  :
-                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block_with_offset));
+                 : "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block_with_offset));
 #endif
 }
 // __buffer_load and __buffer_store
 template <typename T, index_t VectorSize>
 __device__ typename vector_type<T, VectorSize>::MemoryType __buffer_load(const T* p_src_block,
-                                                                          uint32_t src_thread_offset,
-                                                                          uint32_t src_const_offset);
+                                                                          uint32_t src_thread_data_offset,
+                                                                          uint32_t src_const_data_offset);
 
 template <typename T, index_t VectorSize>
 __device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
                                T* p_dst_block,
-                               uint32_t dst_thread_offset,
-                               uint32_t dst_const_offset);
+                               uint32_t dst_thread_data_offset,
+                               uint32_t dst_const_data_offset);
 template <>
 __device__ float __buffer_load<float, 1>(const float* p_src_block,
-                                         uint32_t src_thread_offset,
-                                         uint32_t src_const_offset)
+                                         uint32_t src_thread_data_offset,
+                                         uint32_t src_const_data_offset)
 {
     float dst;
 
+    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
+    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
+
     int32x4_t src_block_setting{0};
     // fill in byte 0 - 1
     *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);

@@ -184,21 +217,23 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
     asm volatile("\n \
     buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
-;;s_waitcnt 0 \n \
+    s_waitcnt 0 \n \
     "
                  : "=v"(dst)
-                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
+                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
 
     return dst;
 }
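The int32x4_t src_block_setting above is the 128-bit buffer resource descriptor that buffer_load_dword consumes: dwords 0-1 hold the 64-bit base address (the "byte 0 - 1" comment apparently refers to these first two dwords), dword 2 the num_records range limit, and dword 3 format/flag bits. Only the base is filled here; a fuller setup could look like the sketch below, where make_buffer_resource, the num_bytes parameter, and the 0x00027000 config word are assumptions rather than code from this commit:

// Sketch of a complete buffer resource (V#) setup; field values illustrative.
__device__ int32x4_t make_buffer_resource(const float* p_base, uint32_t num_bytes)
{
    int32x4_t desc{0};
    // dwords 0-1: base address (48-bit address plus stride bits, left zero here)
    *reinterpret_cast<const float**>(&desc) = p_base;
    // dword 2: num_records, used by the hardware for out-of-range checking
    reinterpret_cast<uint32_t*>(&desc)[2] = num_bytes;
    // dword 3: dfmt/nfmt and addressing flags; 0x00027000 is a commonly used
    // 32-bit-float configuration on GFX9 (assumed, not taken from this commit)
    reinterpret_cast<uint32_t*>(&desc)[3] = 0x00027000;
    return desc;
}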
 template <>
 __device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(const float* p_src_block,
-                                                                      uint32_t src_thread_offset,
-                                                                      uint32_t src_const_offset)
+                                                                      uint32_t src_thread_data_offset,
+                                                                      uint32_t src_const_data_offset)
 {
     vector_type<float, 2>::MemoryType dst;
 
+    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
+    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
+
     int32x4_t src_block_setting{0};
     // fill in byte 0 - 1
     *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);

@@ -209,21 +244,23 @@ __device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(const float
     asm volatile("\n \
     buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
-;;s_waitcnt 0 \n \
+    s_waitcnt 0 \n \
     "
                  : "=v"(dst)
-                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
+                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
 
     return dst;
 }
 template <>
 __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float* p_src_block,
-                                                                      uint32_t src_thread_offset,
-                                                                      uint32_t src_const_offset)
+                                                                      uint32_t src_thread_data_offset,
+                                                                      uint32_t src_const_data_offset)
 {
     vector_type<float, 4>::MemoryType dst;
 
+    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
+    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
+
     int32x4_t src_block_setting{0};
     // fill in byte 0 - 1
     *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);

@@ -234,10 +271,10 @@ __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float
     asm volatile("\n \
     buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
-;;s_waitcnt 0 \n \
+    s_waitcnt 0 \n \
     "
                  : "=v"(dst)
-                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
+                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
 
     return dst;
 }
@@ -245,9 +282,12 @@ __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float
 template <>
 __device__ void __buffer_store<float, 1>(const float& src,
                                          float* p_dst_block,
-                                         uint32_t dst_thread_offset,
-                                         uint32_t dst_const_offset)
+                                         uint32_t dst_thread_data_offset,
+                                         uint32_t dst_const_data_offset)
 {
+    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
+    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
+
     int32x4_t dst_block_setting{0};
     // fill in byte 0 - 1
     *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;

@@ -258,10 +298,12 @@ __device__ void __buffer_store<float, 1>(const float& src,
     asm volatile("\n \
     buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
 ;;s_waitcnt 0 \n \
     "
                  :
-                 : "s"(dst_block_setting), "v"(src), "v"(dst_thread_offset), "s"(dst_const_offset));
+                 : "s"(dst_block_setting), "v"(src), "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset));
 }
 __device__ void vmcnt(index_t cnt)
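Note that the stores above keep their waits disabled: ";;" is the GCN assembler's comment marker, and a store produces no result that the surrounding C++ must consume, whereas the loads re-enable s_waitcnt 0 because %0 is read immediately. The vmcnt(cnt) helper declared here (its body lies outside this hunk) points at the usual refinement: keep several loads in flight, then drain them with a single wait. A hypothetical sketch, not code from this commit:

// Hypothetical: two loads in flight, one wait before either result is used.
// two_loads_one_wait and its parameters are illustrative names only.
__device__ void two_loads_one_wait(const float* p, float& a, float& b)
{
    asm volatile("global_load_dword %0, %2, off offset:0 \n \
                  global_load_dword %1, %2, off offset:4 \n \
                  s_waitcnt vmcnt(0)"
                 : "=&v"(a), "=&v"(b) // early-clobber: outputs must not alias %2
                 : "v"(p)
                 : "memory");
}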
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
 
-#if 1
+#if 0
     // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
     constexpr index_t BlockSize = 256;
driver/src/driver.cpp

@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
 
-#elif 0
+#elif 1
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
     constexpr index_t N = 64;