Commit af2c0166 authored by Jing Zhang
Browse files

add and_or_b32

parent 6d0e78bd
...@@ -19,8 +19,8 @@ __host__ __device__ inline half4_t pki4_to_half4(int q) ...@@ -19,8 +19,8 @@ __host__ __device__ inline half4_t pki4_to_half4(int q)
// Guarantee that the `(a & b) | c` operations are LOP3s. // Guarantee that the `(a & b) | c` operations are LOP3s.
// int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); // int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
// int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = (q & LO) | EX; int lo = amd_assembly_and_or_b32(q, LO, EX);
int hi = (q & HI) | EX; int hi = amd_assembly_and_or_b32(q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`. // directly into `SUB` and `ADD`.
const int SUB = 0xE408E408; //-8 const int SUB = 0xE408E408; //-8
......
...@@ -11,6 +11,15 @@ ...@@ -11,6 +11,15 @@
namespace ck { namespace ck {
// Fused bitwise AND-OR helper implemented with inline assembly.
//
// Emits a single `v_and_or_b32` instruction and returns its destination
// operand. The call sites introduced alongside this helper use it in place of
// the expression `(a & b) | d`.
//
//   a, b : the two values that are ANDed together
//   d    : the value ORed into the AND result
//
// All three inputs and the output are bound with the "v" register constraint.
// NOTE(review): `v_and_or_b32` is presumably not available on every AMD GPU
// target — confirm the supported architectures before using this in code that
// is compiled for older devices.
inline __device__ int amd_assembly_and_or_b32(int a, int b, int d)
{
    int result;
    asm volatile("v_and_or_b32 %0, %1, %2, %3" : "=v"(result) : "v"(a), "v"(b), "v"(d));
    return result;
}
inline __device__ half2_t amd_assembly_pk_fma_f16(half2_t a, half2_t b, half2_t c) inline __device__ half2_t amd_assembly_pk_fma_f16(half2_t a, half2_t b, half2_t c)
{ {
half2_t d; half2_t d;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment