add and_or_b32

af2c0166 · Jing Zhang · 6d0e78bd · af2c0166 · af2c0166
Commit af2c0166 authored Oct 22, 2024 by Jing Zhang
Showing with 11 additions and 2 deletions

include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +2 -2

include/ck/utility/amd_inline_asm.hpp +9 -0

No files found.
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -19,8 +19,8 @@ __host__ __device__ inline half4_t pki4_to_half4(int q)
    // Guarantee that the `(a & b) | c` operations are LOP3s.
    // int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
    // int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
-    int lo = (q & LO) | EX;
+    int lo = amd_assembly_and_or_b32(q, LO, EX);
-    int hi = (q & HI) | EX;
+    int hi = amd_assembly_and_or_b32(q, HI, EX);
    // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
    // directly into `SUB` and `ADD`.
    const int SUB = 0xE408E408; //-8

--- a/include/ck/utility/amd_inline_asm.hpp
+++ b/include/ck/utility/amd_inline_asm.hpp
@@ -11,6 +11,15 @@
 namespace ck {
+inline __device__ int amd_assembly_and_or_b32(int a, int b, int d) // Device-only wrapper around one `v_and_or_b32` instruction; used by pki4_to_half4 as a drop-in for `(a & b) | d`.
+{
+    int c; // Receives the instruction's destination operand (%0); returned to the caller.
+    asm volatile("v_and_or_b32 %0, %1, %2, %3" // Operands in order: %0 = c (dst), %1 = a, %2 = b, %3 = d.
+            : "=v"(c) // Output list: c is write-only; "v" is presumably the AMDGPU VGPR constraint -- confirm against the LLVM AMDGPU docs.
+            : "v"(a), "v"(b), "v"(d)); // Input list: all three sources use the same "v" constraint; no clobbers are declared.
+    return c;
+}
 inline __device__ half2_t amd_assembly_pk_fma_f16(half2_t a, half2_t b, half2_t c)
 {
    half2_t d;