Commit 977fc1ed authored by Jing Zhang's avatar Jing Zhang
Browse files

clean

parent e6230689
...@@ -610,10 +610,10 @@ int main(int argc, char* argv[]) ...@@ -610,10 +610,10 @@ int main(int argc, char* argv[])
#elif 1 #elif 1
// 1x1 filter, 14x14 image, C = 512 // 1x1 filter, 14x14 image, C = 512
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 128; constexpr index_t C = 512;
constexpr index_t HI = 14; constexpr index_t HI = 14;
constexpr index_t WI = 14; constexpr index_t WI = 14;
constexpr index_t K = 128; constexpr index_t K = 512;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 1; constexpr index_t X = 1;
......
...@@ -13,7 +13,7 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; ...@@ -13,7 +13,7 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
#define data4_t vector_type<float, 4>::MemoryType #define data4_t vector_type<float, 4>::MemoryType
#define data_t float #define data_t float
template<unsigned cnt> template<unsigned cnt>
inline __device__ void vmcnt() inline __device__ void vmcnt()
{ {
#if !NO_VM_WAIT #if !NO_VM_WAIT
...@@ -95,8 +95,8 @@ inline __device__ void lgkmcnt(index_t cnt) ...@@ -95,8 +95,8 @@ inline __device__ void lgkmcnt(index_t cnt)
} }
inline __device__ void outerProduct1x4(const data_t& a, inline __device__ void outerProduct1x4(const data_t& a,
const data4_t& b, const data4_t& b,
data4_t& c) data4_t& c)
{ {
#if 0 #if 0
asm volatile( asm volatile(
...@@ -111,8 +111,8 @@ inline __device__ void outerProduct1x4(const data_t& a, ...@@ -111,8 +111,8 @@ inline __device__ void outerProduct1x4(const data_t& a,
"v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w) "v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
); );
#else #else
//hijack here due to a compiler issue that cannot perform proper register //hijack here due to a compiler issue that cannot perform proper register
//mapping for float4 c //mapping for float4 c
data_t *c_p = (data_t *)&c; data_t *c_p = (data_t *)&c;
asm volatile("\n \ asm volatile("\n \
v_mac_f32 %0, %4, %5 \n \ v_mac_f32 %0, %4, %5 \n \
...@@ -120,25 +120,25 @@ inline __device__ void outerProduct1x4(const data_t& a, ...@@ -120,25 +120,25 @@ inline __device__ void outerProduct1x4(const data_t& a,
v_mac_f32 %2, %4, %7 \n \ v_mac_f32 %2, %4, %7 \n \
v_mac_f32 %3, %4, %8 \n \ v_mac_f32 %3, %4, %8 \n \
" "
: "=v"(c[0]), "=v"(c[1]), "=v"(c[2]), "=v"(c[3]) : "=v"(c[0]), "=v"(c[1]), "=v"(c[2]), "=v"(c[3])
: "v"(a), : "v"(a),
"v"(b.x), "v"(b.x),
"v"(b.y), "v"(b.y),
"v"(b.z), "v"(b.z),
"v"(b.w), "v"(b.w),
"0"(c[0]), "0"(c[0]),
"1"(c[1]), "1"(c[1]),
"2"(c[2]), "2"(c[2]),
"3"(c[3])); "3"(c[3]));
#endif #endif
} }
inline __device__ void outerProduct4x4(const data4_t& a, inline __device__ void outerProduct4x4(const data4_t& a,
const data4_t& b, const data4_t& b,
data4_t& c0, data4_t& c0,
data4_t& c1, data4_t& c1,
data4_t& c2, data4_t& c2,
data4_t& c3) data4_t& c3)
{ {
outerProduct1x4(a.x, b, c0); outerProduct1x4(a.x, b, c0);
outerProduct1x4(a.y, b, c1); outerProduct1x4(a.y, b, c1);
...@@ -147,8 +147,8 @@ inline __device__ void outerProduct4x4(const data4_t& a, ...@@ -147,8 +147,8 @@ inline __device__ void outerProduct4x4(const data4_t& a,
} }
inline __device__ void outerProduct8x8(const data4_t* a, inline __device__ void outerProduct8x8(const data4_t* a,
const data4_t* b, const data4_t* b,
data4_t* c) data4_t* c)
{ {
outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]); outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]);
outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]); outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]);
...@@ -164,216 +164,216 @@ inline __device__ void ds_read_b128(data4_t& r, void* lds, index_t offset = 0) ...@@ -164,216 +164,216 @@ inline __device__ void ds_read_b128(data4_t& r, void* lds, index_t offset = 0)
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:0 \n \ ds_read_b128 %0, %1 offset:0 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 128) else if(offset == 128)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:128 \n \ ds_read_b128 %0, %1 offset:128 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 256) else if(offset == 256)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:256 \n \ ds_read_b128 %0, %1 offset:256 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 384) else if(offset == 384)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:384 \n \ ds_read_b128 %0, %1 offset:384 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 512) else if(offset == 512)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:512 \n \ ds_read_b128 %0, %1 offset:512 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 640) else if(offset == 640)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:640 \n \ ds_read_b128 %0, %1 offset:640 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 768) else if(offset == 768)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:768 \n \ ds_read_b128 %0, %1 offset:768 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 896) else if(offset == 896)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:896 \n \ ds_read_b128 %0, %1 offset:896 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 1024) else if(offset == 1024)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:1024 \n \ ds_read_b128 %0, %1 offset:1024 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 1152) else if(offset == 1152)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:1152 \n \ ds_read_b128 %0, %1 offset:1152 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 1280) else if(offset == 1280)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:1280 \n \ ds_read_b128 %0, %1 offset:1280 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 1408) else if(offset == 1408)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:1408 \n \ ds_read_b128 %0, %1 offset:1408 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 1536) else if(offset == 1536)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:1536 \n \ ds_read_b128 %0, %1 offset:1536 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 1664) else if(offset == 1664)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:1664 \n \ ds_read_b128 %0, %1 offset:1664 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 1792) else if(offset == 1792)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:1792 \n \ ds_read_b128 %0, %1 offset:1792 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 1920) else if(offset == 1920)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:1920 \n \ ds_read_b128 %0, %1 offset:1920 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 2048) else if(offset == 2048)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:2048 \n \ ds_read_b128 %0, %1 offset:2048 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 2176) else if(offset == 2176)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:2176 \n \ ds_read_b128 %0, %1 offset:2176 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 2304) else if(offset == 2304)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:2304 \n \ ds_read_b128 %0, %1 offset:2304 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 2560) else if(offset == 2560)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:2560 \n \ ds_read_b128 %0, %1 offset:2560 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 2816) else if(offset == 2816)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:2816 \n \ ds_read_b128 %0, %1 offset:2816 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 3072) else if(offset == 3072)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:3072 \n \ ds_read_b128 %0, %1 offset:3072 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 3328) else if(offset == 3328)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:3328 \n \ ds_read_b128 %0, %1 offset:3328 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 3584) else if(offset == 3584)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:3584 \n \ ds_read_b128 %0, %1 offset:3584 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 3840) else if(offset == 3840)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:3840 \n \ ds_read_b128 %0, %1 offset:3840 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 4096) else if(offset == 4096)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:4096 \n \ ds_read_b128 %0, %1 offset:4096 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else if(offset == 4352) else if(offset == 4352)
{ {
asm volatile("\n \ asm volatile("\n \
ds_read_b128 %0, %1 offset:4352 \n \ ds_read_b128 %0, %1 offset:4352 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(__to_local(lds))); : "v"(__to_local(lds)));
} }
else else
{ {
...@@ -383,8 +383,8 @@ inline __device__ void ds_read_b128(data4_t& r, void* lds, index_t offset = 0) ...@@ -383,8 +383,8 @@ inline __device__ void ds_read_b128(data4_t& r, void* lds, index_t offset = 0)
} }
inline __device__ void global_store(data4_t& r, inline __device__ void global_store(data4_t& r,
const void* vptr, const void* vptr,
const void* sprt = 0) const void* sprt = 0)
{ {
#if !NO_GLB_READ #if !NO_GLB_READ
if(sprt == 0) if(sprt == 0)
...@@ -407,8 +407,8 @@ inline __device__ void global_store(data4_t& r, ...@@ -407,8 +407,8 @@ inline __device__ void global_store(data4_t& r,
inline __device__ void global_load(data4_t& r, inline __device__ void global_load(data4_t& r,
const void* vptr, const void* vptr,
const void* sprt = 0) const void* sprt = 0)
{ {
#if !NO_GLB_READ #if !NO_GLB_READ
if(sprt == 0) if(sprt == 0)
...@@ -421,26 +421,26 @@ inline __device__ void global_load(data4_t& r, ...@@ -421,26 +421,26 @@ inline __device__ void global_load(data4_t& r,
} }
else else
{ {
asm volatile("\n \ asm volatile("\n \
global_load_dwordx4 %0, %1, %2 \n \ global_load_dwordx4 %0, %1, %2 \n \
" "
: "=v"(r) : "=v"(r)
: "v"(vptr), "s"(sprt)); : "v"(vptr), "s"(sprt));
} }
#endif #endif
} }
inline __device__ void inline __device__ void
ds_write_b128(const data4_t& r, void* lds, index_t offset = 0) ds_write_b128(const data4_t& r, void* lds, index_t offset = 0)
{ {
#if !NO_DS_WRITE #if !NO_DS_WRITE
if(offset == 0) if(offset == 0)
{ {
asm volatile("\n \ asm volatile("\n \
ds_write_b128 %0, %1 \n \ ds_write_b128 %0, %1 \n \
" "
: :
: "v"(__to_local(lds)), "v"(r)); : "v"(__to_local(lds)), "v"(r));
} }
else else
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment