Unverified commit 85bc98ac, authored by qinyiqun, committed by GitHub
Browse files

ISSUE/628 适配QY C610 GPU,增加编译选项,适配已有算子。添加bge类模型所需的算子, (#629)



* ISSUE/628 适配QY C610 GPU,增加编译选项,适配已有算子。添加bge类模型所需的算子,包括gelu,layer_norm,lp_norm(支持l1,l2 norm),relu,softmax,tanh。

---------
Co-authored-by: xgqdut2016 <kenan_gewei@163.com>
Co-authored-by: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com>
parent 7c397dd2
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
_TEST_CASES_ = [
    # shape, bias_exist, eps, input_strides, output_strides, weight_strides
    ((5, 4), True, 1e-5, None, None, None),
    ((5, 4, 32, 2048), True, 1e-5, None, None, None),
    ((13, 4, 4), True, 1e-5, [30, 4, 1], [50, 4, 1], [2]),
    ((16, 5, 563), True, 1e-4, None, None, None),
    ((5, 16, 563), False, 1e-5, None, None, [10]),
    ((4, 4, 563), True, 1e-5, None, None, None),
    ((40, 40, 56), True, 1e-5, [3600, 56, 1], None, None),
    ((40, 40, 56), False, 1e-5, [3600, 56, 1], None, None),
]


class Inplace(Enum):
    """Whether the kernel writes its result back into the input tensor."""

    OUT_OF_PLACE = auto()
    INPLACE = auto()


# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.OUT_OF_PLACE,
    Inplace.INPLACE,
]

# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 5e-2, "rtol": 5e-2},
    InfiniDtype.F32: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.BF16: {"atol": 5e-2, "rtol": 5e-2},
}

# Runtime options; overridden from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def torch_layer_norm(
output: torch.Tensor,
input_standardization: torch.Tensor,
input_std_deviation: torch.Tensor,
input: torch.Tensor,
weight,
bias,
eps,
bias_exist: bool,
):
normalized_shape = input.shape[-1:]
ln = torch.nn.LayerNorm(
normalized_shape=normalized_shape,
eps=eps,
dtype=torch.float,
bias=bias_exist,
device=input.device,
)
ln.weight.data = weight.type(torch.float)
if bias_exist:
ln.bias.data = bias.type(torch.float)
input = input.type(torch.float)
mean = input.mean(dim=-1, keepdim=True)
var = input.var(dim=-1, correction=0)
std = torch.sqrt(var + eps)
input_standardization.copy_(
((input - mean) / std.unsqueeze(2)).type(input_standardization.dtype)
)
input_std_deviation.copy_(std.type(input_standardization.dtype))
output.copy_(ln(input).detach().type(output.dtype))
def layer_norm(
    output: torch.Tensor, input: torch.Tensor, weight, bias, eps, bias_exist: bool
):
    """Reference LayerNorm over the last dimension, written into ``output``.

    Equivalent to running ``torch.nn.LayerNorm`` with the supplied parameters;
    calls the functional form directly instead of constructing a module.
    """
    normalized_shape = input.shape[-1:]
    result = torch.nn.functional.layer_norm(
        input,
        normalized_shape,
        weight=weight,
        bias=bias if bias_exist else None,
        eps=eps,
    )
    output.copy_(result.detach().type(output.dtype))
def test(
    handle,
    device,
    input_shape,
    bias_exist,
    eps,
    input_strides,
    output_strides,
    weight_strides,
    inplace,
    dtype,
    sync=None,
):
    """Run one LayerNorm case against the infiniop kernel.

    Builds input/weight/bias tensors, computes the PyTorch reference via
    ``layer_norm``, runs the library kernel, and compares the output, the
    standardized input, and the per-row std deviation within tolerance.
    """
    print(
        f"Testing layer_norm on {InfiniDeviceNames[device]} with input_shape:{input_shape},"
        f"bias:{bias_exist},eps:{eps},"
        f"dtype:{InfiniDtypeNames[dtype]}"
    )
    # Extra kernel outputs: (x - mean) / std, and std over the last dim.
    input_standardization = TestTensor(
        input_shape,
        None,
        dtype,
        device,
    )
    input_std_deviation = TestTensor(
        input_shape[:-1],
        None,
        dtype,
        device,
    )
    input = TestTensor(input_shape, input_strides, dtype, device, mode="zeros")
    if inplace == Inplace.INPLACE:
        # In-place only makes sense when output layout matches input layout.
        if output_strides != input_strides:
            return
        output = input
    else:
        output = TestTensor(
            input_shape,
            output_strides,
            dtype,
            device,
        )
    weight = TestTensor(
        input_shape[-1:],
        weight_strides,
        dtype,
        device,
    )
    bias = (
        TestTensor(
            input_shape[-1:],
            None,
            dtype,
            device,
        )
        if bias_exist
        else None
    )
    layer_norm(
        output.torch_tensor(),
        input.torch_tensor(),
        weight.torch_tensor(),
        bias.torch_tensor() if bias_exist else None,
        eps,
        bias_exist,
    )
    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateLayerNormDescriptor(
            handle,
            ctypes.byref(descriptor),
            output.descriptor,
            input_standardization.descriptor,
            input_std_deviation.descriptor,
            input.descriptor,
            weight.descriptor,
            bias.descriptor if bias_exist else None,
            eps,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from
    # being directly used by the kernel.
    # Fix: the original expression parsed as `(base + [bias]) if bias_exist
    # else []`, so NO descriptor was destroyed when bias_exist was False;
    # only the optional bias should be conditional.
    for tensor in [output, input_standardization, input_std_deviation, input, weight] + (
        [bias] if bias_exist else []
    ):
        tensor.destroy_desc()

    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetLayerNormWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, output.device)

    def lib_layer_norm():
        check_error(
            LIBINFINIOP.infiniopLayerNorm(
                descriptor,
                workspace.data(),
                workspace.size(),
                output.data(),
                input_standardization.data(),
                input_std_deviation.data(),
                input.data(),
                weight.data(),
                bias.data() if bias_exist else None,
                None,  # stream
            )
        )

    lib_layer_norm()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
        debug(
            input_standardization.actual_tensor(),
            input_standardization.torch_tensor(),
            atol=atol,
            rtol=rtol,
        )
        debug(
            input_std_deviation.actual_tensor(),
            input_std_deviation.torch_tensor(),
            atol=atol,
            rtol=rtol,
        )
    assert torch.allclose(
        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
    )
    assert torch.allclose(
        input_standardization.actual_tensor(),
        input_standardization.torch_tensor(),
        atol=atol,
        rtol=rtol,
    )
    assert torch.allclose(
        input_std_deviation.actual_tensor(),
        input_std_deviation.torch_tensor(),
        atol=atol,
        rtol=rtol,
    )

    # Profiling workflow
    if PROFILE:
        # Fix: torch_layer_norm expects torch tensors, but the TestTensor
        # wrappers were being passed directly; unwrap them here.
        # fmt: off
        profile_operation("PyTorch", lambda: torch_layer_norm(
            output.torch_tensor(), input_standardization.torch_tensor(),
            input_std_deviation.torch_tensor(), input.torch_tensor(),
            weight.torch_tensor(), bias.torch_tensor() if bias_exist else None,
            eps, bias_exist
        ), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_layer_norm(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroyLayerNormDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()

    # Configure testing options from the command line.
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Run every (case, dtype) combination on every requested device.
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest my layer_norm passed!\033[0m")
......@@ -8,6 +8,7 @@ class InfiniDeviceEnum:
ILUVATAR = 6
KUNLUN = 7
HYGON = 8
QY = 9
InfiniDeviceNames = {
......@@ -20,6 +21,7 @@ InfiniDeviceNames = {
InfiniDeviceEnum.ILUVATAR: "Iluvatar",
InfiniDeviceEnum.KUNLUN: "Kunlun",
InfiniDeviceEnum.HYGON: "Hygon",
InfiniDeviceEnum.QY: "QY",
}
# Mapping that maps InfiniDeviceEnum to torch device string
......@@ -33,4 +35,5 @@ torch_device_map = {
InfiniDeviceEnum.ILUVATAR: "cuda",
InfiniDeviceEnum.KUNLUN: "cuda",
InfiniDeviceEnum.HYGON: "cuda",
InfiniDeviceEnum.QY: "cuda",
}
......@@ -456,6 +456,39 @@ def sub_(lib):
]
@OpRegister.operator
def softmax_(lib):
    """Register ctypes signatures for the Softmax operator entry points."""
    lib.infiniopCreateSoftmaxDescriptor.restype = c_int32
    lib.infiniopCreateSoftmaxDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # y (output)
        infiniopTensorDescriptor_t,  # x (input)
        c_int32,  # axis to reduce over
    ]
    lib.infiniopGetSoftmaxWorkspaceSize.restype = c_int32
    lib.infiniopGetSoftmaxWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopSoftmax.restype = c_int32
    lib.infiniopSoftmax.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace size
        c_void_p,  # y
        c_void_p,  # x
        c_void_p,  # stream
    ]
    lib.infiniopDestroySoftmaxDescriptor.restype = c_int32
    lib.infiniopDestroySoftmaxDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def swiglu_(lib):
lib.infiniopCreateSwiGLUDescriptor.restype = c_int32
......@@ -578,7 +611,7 @@ def topksoftmax_(lib):
c_void_p,
c_void_p,
c_size_t,
c_int32,
c_int32,
c_void_p,
]
lib.infiniopDestroyTopksoftmaxDescriptor.restype = c_int32
......@@ -594,7 +627,7 @@ def topkrouter_(lib):
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t
infiniopTensorDescriptor_t,
]
lib.infiniopGetTopkrouterWorkspaceSize.restype = c_int32
......@@ -706,6 +739,7 @@ def zeros_(lib):
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def ones_(lib):
lib.infiniopCreateOnesDescriptor.restype = c_int32
......@@ -738,6 +772,38 @@ def ones_(lib):
]
@OpRegister.operator
def gelu_(lib):
    """Register ctypes signatures for the Gelu operator entry points."""
    lib.infiniopCreateGeluDescriptor.restype = c_int32
    lib.infiniopCreateGeluDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetGeluWorkspaceSize.restype = c_int32
    lib.infiniopGetGeluWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopGelu.restype = c_int32
    # Presumably (workspace, workspace_size, output, input, stream), mirroring
    # the other unary ops (tanh/softmax) — confirm against the C header.
    lib.infiniopGelu.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyGeluDescriptor.restype = c_int32
    lib.infiniopDestroyGeluDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def silu_(lib):
lib.infiniopCreateSiluDescriptor.restype = c_int32
......@@ -768,3 +834,107 @@ def silu_(lib):
lib.infiniopDestroySiluDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def layer_norm_(lib):
    """Register ctypes signatures for the LayerNorm operator entry points."""
    lib.infiniopCreateLayerNormDescriptor.restype = c_int32
    lib.infiniopCreateLayerNormDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # output
        infiniopTensorDescriptor_t,  # input_standardization
        infiniopTensorDescriptor_t,  # input_std_deviation
        infiniopTensorDescriptor_t,  # input
        infiniopTensorDescriptor_t,  # weight
        infiniopTensorDescriptor_t,  # bias (callers pass None when absent)
        c_float,  # eps
    ]
    lib.infiniopGetLayerNormWorkspaceSize.restype = c_int32
    lib.infiniopGetLayerNormWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopLayerNorm.restype = c_int32
    lib.infiniopLayerNorm.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace size
        c_void_p,  # output
        c_void_p,  # input_standardization
        c_void_p,  # input_std_deviation
        c_void_p,  # input
        c_void_p,  # weight
        c_void_p,  # bias (or None)
        c_void_p,  # stream
    ]
    lib.infiniopDestroyLayerNormDescriptor.restype = c_int32
    lib.infiniopDestroyLayerNormDescriptor.argtypes = [infiniopOperatorDescriptor_t]
@OpRegister.operator
def lp_norm_(lib):
    """Register ctypes signatures for the LPNorm operator entry points."""
    lib.infiniopCreateLPNormDescriptor.restype = c_int32
    lib.infiniopCreateLPNormDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # y (output)
        infiniopTensorDescriptor_t,  # x (input)
        c_int32,  # axis
        c_int32,  # p (norm order; tests use 1 and 2)
        c_float,  # eps
    ]
    lib.infiniopGetLPNormWorkspaceSize.restype = c_int32
    lib.infiniopGetLPNormWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopLPNorm.restype = c_int32
    lib.infiniopLPNorm.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace size
        c_void_p,  # y
        c_void_p,  # x
        c_void_p,  # stream
    ]
    lib.infiniopDestroyLPNormDescriptor.restype = c_int32
    lib.infiniopDestroyLPNormDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def tanh_(lib):
    """Register ctypes signatures for the Tanh operator entry points."""
    lib.infiniopCreateTanhDescriptor.restype = c_int32
    lib.infiniopCreateTanhDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # output
        infiniopTensorDescriptor_t,  # input
    ]
    lib.infiniopGetTanhWorkspaceSize.restype = c_int32
    lib.infiniopGetTanhWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopTanh.restype = c_int32
    lib.infiniopTanh.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace size
        c_void_p,  # output
        c_void_p,  # input
        c_void_p,  # stream
    ]
    lib.infiniopDestroyTanhDescriptor.restype = c_int32
    lib.infiniopDestroyTanhDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
......@@ -94,9 +94,17 @@ class TestTensor(CTensor):
elif mode == "randint":
randint_low = -2000000000 if randint_low is None else randint_low
randint_high = 2000000000 if randint_high is None else randint_high
self._torch_tensor = torch.randint(randint_low,randint_high, torch_shape,dtype=to_torch_dtype(dt), device=torch_device_map[device])
self._torch_tensor = torch.randint(
randint_low,
randint_high,
torch_shape,
dtype=to_torch_dtype(dt),
device=torch_device_map[device],
)
elif mode == "float8_e4m3fn":
self._torch_tensor = torch.rand(shape, dtype=torch.float32, device=torch_device_map[device]).to(dtype=torch.float8_e4m3fn)
self._torch_tensor = torch.rand(
shape, dtype=torch.float32, device=torch_device_map[device]
).to(dtype=torch.float8_e4m3fn)
elif mode == "manual":
assert set_tensor is not None
assert torch_shape == list(set_tensor.shape)
......@@ -136,14 +144,19 @@ class TestTensor(CTensor):
def is_broadcast(self):
return self.strides is not None and 0 in self.strides
@staticmethod
def from_binary(binary_file, shape, strides, dt: InfiniDtype, device: InfiniDeviceEnum):
def from_binary(
binary_file, shape, strides, dt: InfiniDtype, device: InfiniDeviceEnum
):
data = np.fromfile(binary_file, dtype=to_numpy_dtype(dt))
base = torch.from_numpy(data)
torch_tensor = torch.as_strided(base, size=shape, stride=strides).to(torch_device_map[device])
torch_tensor = torch.as_strided(base, size=shape, stride=strides).to(
torch_device_map[device]
)
return TestTensor(
shape, strides, dt, device, mode="binary", set_tensor=torch_tensor)
shape, strides, dt, device, mode="binary", set_tensor=torch_tensor
)
@staticmethod
def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum):
......@@ -156,6 +169,9 @@ class TestTensor(CTensor):
def update_torch_tensor(self, new_tensor: torch.Tensor):
    """Replace the torch tensor backing this TestTensor.

    Fix: this method was defined twice verbatim (the diff added a second
    identical copy immediately after the first); keep a single definition.
    """
    self._torch_tensor = new_tensor
def to_torch_dtype(dt: InfiniDtype, compatability_mode=False):
if dt == InfiniDtype.BOOL:
......@@ -225,7 +241,6 @@ def to_numpy_dtype(dt: InfiniDtype, compatability_mode=False):
raise ValueError("Unsupported data type")
class TestWorkspace:
def __init__(self, size, device):
if size != 0:
......@@ -294,7 +309,18 @@ def rearrange_tensor(tensor, new_strides):
new_positions += offset
# Copy the original data to the new tensor
if tensor.dtype in [torch.bool, torch.uint8, torch.int8, torch.int16, torch.int32,torch.int64, torch.float16,torch.bfloat16,torch.float32,torch.float64]:
if tensor.dtype in [
torch.bool,
torch.uint8,
torch.int8,
torch.int16,
torch.int32,
torch.int64,
torch.float16,
torch.bfloat16,
torch.float32,
torch.float64,
]:
new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1))
elif tensor.dtype in [torch.uint16, torch.uint32, torch.uint64]:
new_tensor_int64 = new_tensor.to(dtype=torch.int64)
......@@ -303,12 +329,14 @@ def rearrange_tensor(tensor, new_strides):
new_tensor = new_tensor_int64.to(dtype=tensor.dtype)
elif tensor.dtype in [torch.float8_e4m3fn]:
new_tensor_float64 = new_tensor.to(dtype=torch.float64)
tensor_float64 = tensor.to(dtype=torch.float64)
new_tensor_float64.view(-1).index_add_(0, new_positions, tensor_float64.view(-1))
tensor_float64 = tensor.to(dtype=torch.float64)
new_tensor_float64.view(-1).index_add_(
0, new_positions, tensor_float64.view(-1)
)
new_tensor = new_tensor_float64.to(dtype=tensor.dtype)
else:
raise ValueError("Unsupported data type")
new_tensor.set_(new_tensor.untyped_storage(), offset, shape, tuple(new_strides))
return new_tensor
......@@ -355,6 +383,11 @@ def get_args():
action="store_true",
help="Run Iluvatar GPU test",
)
parser.add_argument(
"--qy",
action="store_true",
help="Run Qy GPU test",
)
parser.add_argument(
"--cambricon",
action="store_true",
......@@ -515,7 +548,7 @@ def print_discrepancy(
actual = actual.to("cpu")
expected = expected.to("cpu")
actual_isnan = torch.isnan(actual)
expected_isnan = torch.isnan(expected)
......@@ -525,7 +558,8 @@ def print_discrepancy(
)
diff_mask = nan_mismatch | (
torch.abs(actual.to(dtype=torch.float64) - expected.to(dtype=torch.float64)) > (atol + rtol * torch.abs(expected.to(dtype=torch.float64)))
torch.abs(actual.to(dtype=torch.float64) - expected.to(dtype=torch.float64))
> (atol + rtol * torch.abs(expected.to(dtype=torch.float64)))
)
diff_indices = torch.nonzero(diff_mask, as_tuple=False)
delta = actual.to(dtype=torch.float64) - expected.to(dtype=torch.float64)
......@@ -670,6 +704,8 @@ def get_test_devices(args):
devices_to_test.append(InfiniDeviceEnum.NVIDIA)
if args.iluvatar:
devices_to_test.append(InfiniDeviceEnum.ILUVATAR)
if args.qy:
devices_to_test.append(InfiniDeviceEnum.QY)
if args.cambricon:
import torch_mlu
......
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
#  Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
    # shape, x_strides, y_strides, axis, p, eps
    ((2, 1, 512), [17408, 1024, 1], [17408, 1024, 1], -1, 2, 1e-12),
    ((2, 1, 1024), [17408, 1024, 1], [17408, 1024, 1], -1, 2, 1e-12),
    ((2, 1, 2048), [17408, 1024, 1], [17408, 1024, 1], -1, 2, 1e-12),
    ((2048, 2050), None, None, 0, 1, 1e-12),
    ((2048, 2050), None, None, 1, 1, 1e-12),
    ((12, 16, 512, 512), None, None, 0, 2, 1e-12),
    ((12, 16, 512, 512), None, None, 1, 2, 1e-12),
    ((12, 16, 512, 512), None, None, 2, 1, 1e-12),
    ((12, 16, 512, 512), None, None, 3, 2, 1e-12),
    ((1, 16, 512, 512), None, None, 0, 2, 1e-12),
    ((1, 16, 512, 512), None, None, 1, 1, 1e-12),
    ((1, 16, 512, 512), None, None, 2, 2, 1e-12),
    ((1, 16, 512, 512), None, None, 3, 2, 1e-12),
]

# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2},
    InfiniDtype.F32: {"atol": 3e-5, "rtol": 1e-5},
}


class Inplace(Enum):
    """Whether y aliases x (in-place) or is a separate output tensor."""

    OUT_OF_PLACE = auto()
    INPLACE_X = auto()


# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.INPLACE_X,
    Inplace.OUT_OF_PLACE,
]

# Cross product of base cases and inplace options.
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Runtime options; overridden from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def lp_norm(x, axis, p, eps):
    """Reference Lp normalization along ``axis``.

    Matches torch.nn.functional.normalize: x / max(||x||_p, eps), computed in
    float32 and cast back to the input dtype.
    """
    x32 = x.to(torch.float32)
    denom = x32.norm(p, dim=axis, keepdim=True).clamp_min(eps)
    return (x32 / denom).to(x.dtype)
def test(
    handle,
    device,
    shape,
    x_strides,
    y_strides,
    axis,
    p,
    eps,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=InfiniDtype.F16,
    sync=None,
):
    """Run one LPNorm case: compute the PyTorch reference, run the infiniop
    kernel, and compare within the dtype's tolerance.
    """
    print(
        f"Testing LPNorm on {InfiniDeviceNames[device]} with shape:{shape}, y_strides:{y_strides}, x_strides:{x_strides}, axis:{axis}, p:{p}, eps:{eps} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )
    x = TestTensor(shape, x_strides, dtype, device)
    # Reference answer computed before the kernel runs (x may be overwritten
    # in the INPLACE_X case).
    ans = lp_norm(x.torch_tensor(), axis, p, eps)
    if inplace == Inplace.INPLACE_X:
        y = x
    else:
        y = TestTensor(shape, y_strides, dtype, device)
    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateLPNormDescriptor(
            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, axis, p, eps
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x.destroy_desc()
    y.destroy_desc()

    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetLPNormWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, x.device)

    def lib_lp_norm():
        check_error(
            LIBINFINIOP.infiniopLPNorm(
                descriptor,
                workspace.data(),
                workspace_size.value,
                y.data(),
                x.data(),
                None,  # stream
            )
        )

    lib_lp_norm()
    if sync is not None:
        sync()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: lp_norm(x.torch_tensor(), axis, p, eps), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_lp_norm(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroyLPNormDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()

    # Configure testing options from the command line.
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Run every (case, dtype) combination on every requested device.
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
......@@ -43,6 +43,7 @@ class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_X = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
Inplace.OUT_OF_PLACE,
......@@ -71,9 +72,11 @@ PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def torch_sigmoid(y, x):
    """Reference implementation: write elementwise sigmoid(x) into y."""
    torch.special.expit(x, out=y)
def test(
handle,
device,
......@@ -169,4 +172,3 @@ if __name__ == "__main__":
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92m Test passed! \033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
#  Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
    # shape, axis
    ((4, 4), 0),
    ((12, 16, 512, 512), 0),
    ((12, 16, 512, 512), 1),
    ((12, 16, 512, 512), 2),
    ((12, 16, 512, 512), 3),
    ((1, 16, 512, 512), 0),
    ((1, 16, 512, 512), 1),
    ((1, 16, 512, 512), 2),
    ((1, 16, 512, 512), 3),
]

# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
    InfiniDtype.F32: {"atol": 3e-5, "rtol": 1e-5},
}


class Inplace(Enum):
    """Whether y aliases x (in-place) or is a separate output tensor."""

    OUT_OF_PLACE = auto()
    INPLACE_X = auto()


# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.INPLACE_X,
    Inplace.OUT_OF_PLACE,
]

# Cross product of base cases and inplace options.
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Runtime options; overridden from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def softmax(x, axis):
    """Reference softmax along the given axis."""
    return torch.nn.functional.softmax(x, dim=axis)
def test(
    handle,
    device,
    shape,
    axis,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=InfiniDtype.F16,
    sync=None,
):
    """Run one Softmax case: compute the PyTorch reference, run the infiniop
    kernel, and compare within the dtype's tolerance.
    """
    print(
        f"Testing Softmax on {InfiniDeviceNames[device]} with shape:{shape}, axis:{axis} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )
    x = TestTensor(shape, None, dtype, device)
    # Reference answer computed before the kernel runs (x may be overwritten
    # in the INPLACE_X case).
    ans = softmax(x.torch_tensor(), axis)
    if inplace == Inplace.INPLACE_X:
        y = x
    else:
        y = TestTensor(shape, None, dtype, device)
    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateSoftmaxDescriptor(
            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, axis
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x.destroy_desc()
    y.destroy_desc()

    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetSoftmaxWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, x.device)

    def lib_softmax():
        check_error(
            LIBINFINIOP.infiniopSoftmax(
                descriptor,
                workspace.data(),
                workspace_size.value,
                y.data(),
                x.data(),
                None,  # stream
            )
        )

    lib_softmax()
    if sync is not None:
        sync()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: softmax(x.torch_tensor(), axis), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_softmax(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroySoftmaxDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()

    # Configure testing options from the command line.
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Run every (case, dtype) combination on every requested device.
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
get_sync_func,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ========================================================================
#  Configuration (Internal Use Only)
# ========================================================================
_TEST_CASES_ = [
    # shape, input_stride, output_stride
    ((13, 4), None, None),
    ((13, 4), (10, 1), (10, 1)),
    ((13, 4), (0, 1), None),
    ((13, 4, 4), None, None),
    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
    ((13, 4, 4), (4, 0, 1), None),
    ((16, 5632), None, None),
    ((16, 5632), (10240, 1), (10240, 1)),
    ((4, 4, 5632), None, None),
    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
]


class Inplace(Enum):
    """Whether output aliases input (in-place) or is a separate tensor."""

    OUT_OF_PLACE = auto()
    INPLACE_INPUT = auto()


# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.OUT_OF_PLACE,
    Inplace.INPLACE_INPUT,
]

# Cross product of base cases and inplace options.
_TEST_CASES = [
    test_case + (inplace,) for test_case in _TEST_CASES_ for inplace in _INPLACE
]

# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
}

# Runtime options; overridden from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def tanh(output, input):
    """Reference implementation: write elementwise tanh(input) into output."""
    output.copy_(input.tanh())
def test(
    handle,
    device,
    shape,
    input_stride=None,
    output_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=InfiniDtype.F16,
    sync=None,
):
    """Run one Tanh case: compute the PyTorch reference, run the infiniop
    kernel, and compare within the dtype's tolerance.

    Fix: the default for ``dtype`` was ``torch.float16``, which is not an
    InfiniDtype key and would break ``InfiniDtypeNames[dtype]`` if the default
    were ever used; the sibling operator tests default to ``InfiniDtype.F16``.
    (``test_operator`` always passes ``dtype`` explicitly, so this is
    backward-compatible.)
    """
    input = TestTensor(shape, input_stride, dtype, device)
    if inplace == Inplace.INPLACE_INPUT:
        # In-place requires identical input/output layouts.
        if input_stride != output_stride:
            return
        output = input
    else:
        output = TestTensor(shape, output_stride, dtype, device, mode="ones")

    # Broadcast outputs (a zero stride) cannot be written to; skip.
    if output.is_broadcast():
        return

    print(
        f"Testing Tanh on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} "
        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )
    tanh(output.torch_tensor(), input.torch_tensor())
    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateTanhDescriptor(
            handle,
            ctypes.byref(descriptor),
            output.descriptor,
            input.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from
    # being directly used by the kernel.
    for tensor in [input, output]:
        tensor.destroy_desc()

    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetTanhWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, output.device)

    def lib_tanh():
        check_error(
            LIBINFINIOP.infiniopTanh(
                descriptor,
                workspace.data(),
                workspace_size.value,
                output.data(),
                input.data(),
                None,  # stream
            )
        )

    lib_tanh()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
    assert torch.allclose(
        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
    )

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: tanh(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_tanh(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroyTanhDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()

    # Configure testing options from the command line.
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Run every (case, dtype) combination on every requested device.
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
......@@ -102,6 +102,18 @@ if has_config("iluvatar-gpu") then
includes("xmake/iluvatar.lua")
end
-- qy
option("qy-gpu")
set_default(false)
set_showmenu(true)
set_description("Whether to compile implementations for Qy GPU")
option_end()
if has_config("qy-gpu") then
add_defines("ENABLE_QY_API")
includes("xmake/qy.lua")
end
-- 沐曦
option("metax-gpu")
set_default(false)
......@@ -228,6 +240,10 @@ target("infinirt")
if has_config("iluvatar-gpu") then
add_deps("infinirt-iluvatar")
end
if has_config("qy-gpu") then
add_deps("infinirt-qy")
add_files("build/.objs/infinirt-qy/rules/qy.cuda/src/infinirt/cuda/*.cu.o", {public = true})
end
if has_config("kunlun-xpu") then
add_deps("infinirt-kunlun")
end
......@@ -253,6 +269,11 @@ target("infiniop")
if has_config("iluvatar-gpu") then
add_deps("infiniop-iluvatar")
end
if has_config("qy-gpu") then
add_deps("infiniop-qy")
add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/nvidia/*.cu.o", {public = true})
add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/devices/nvidia/*.cu.o", {public = true})
end
if has_config("cambricon-mlu") then
add_deps("infiniop-cambricon")
......@@ -303,6 +324,9 @@ target("infiniccl")
if has_config("iluvatar-gpu") then
add_deps("infiniccl-iluvatar")
end
if has_config("qy-gpu") then
add_deps("infiniccl-qy")
end
if has_config("moore-gpu") then
add_deps("infiniccl-moore")
......
-- Toolchain setup for the QY (Denglin) GPU SDK; cuDNN location is taken from
-- the environment when available.
local CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH")
if CUDNN_ROOT ~= nil then
    add_includedirs(CUDNN_ROOT .. "/include")
end

add_includedirs("/usr/local/denglin/sdk/include", "../include")
add_linkdirs("/usr/local/denglin/sdk/lib")
add_links("curt", "cublas", "cudnn")
set_languages("cxx17")
add_cxxflags("-std=c++17") -- explicitly pin C++17
add_cuflags("--std=c++17",{force = true}) -- make sure the CUDA compiler uses C++17 as well
-- Rule that swallows pre-built .o files so they are linked as-is.
rule("ignore.o")
    set_extensions(".o") -- keep xmake's default handling away from these files
    on_build_files(function () end)
-- Rule that compiles .cu sources with the Denglin dlcc compiler instead of
-- nvcc, registering the produced object files with the target.
rule("qy.cuda")
    set_extensions(".cu")

    on_load(function (target)
        target:add("includedirs", "/usr/local/denglin/sdk/include")
    end)

    after_load(function (target)
        -- Filter out the CUDA runtime system libraries (cudadevrt /
        -- cudart_static) that the Denglin toolchain does not provide.
        local links = target:get("syslinks") or {}
        local filtered = {}
        for _, link in ipairs(links) do
            if link ~= "cudadevrt" and link ~= "cudart_static" then
                table.insert(filtered, link)
            end
        end
        target:set("syslinks", filtered)
    end)

    on_buildcmd_file(function (target, batchcmds, sourcefile, opt)
        import("core.project.project")
        import("core.project.config")
        import("core.base.option")

        local dlcc = "/usr/local/denglin/sdk/bin/dlcc"
        local sdk_path = "/usr/local/denglin/sdk"
        local arch = "dlgput64"

        local relpath = path.relative(sourcefile, project.directory())
        local objfile = path.join(config.buildir(), ".objs", target:name(), "rules", "qy.cuda", relpath .. ".o")

        -- Force-register the .o file with the target so it gets linked.
        target:add("objectfiles", objfile)
        target:set("buildadd", true)

        local argv = {
            "-c", sourcefile,
            "-o", objfile,
            "--cuda-path=" .. sdk_path,
            "--cuda-gpu-arch=" .. arch,
            "-std=c++17", "-O2", "-fPIC"
        }
        for _, incdir in ipairs(target:get("includedirs") or {}) do
            table.insert(argv, "-I" .. incdir)
        end
        for _, def in ipairs(target:get("defines") or {}) do
            table.insert(argv, "-D" .. def)
        end

        batchcmds:mkdir(path.directory(objfile))
        batchcmds:show_progress(opt.progress, "${color.build.object}compiling.dlcu %s", relpath)
        batchcmds:vrunv(dlcc, argv)
    end)
-- Static library with the QY implementations of the infiniop operators.
-- It reuses the CUDA (nvidia) sources and compiles them with dlcc through the
-- "qy.cuda" rule defined above.
target("infiniop-qy")
    set_kind("static")
    add_deps("infini-utils")
    on_install(function (target) end)
    add_rules("qy.cuda", {override = true})
    if is_plat("windows") then
        add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler")
        add_cuflags("-Xcompiler=/W3", "-Xcompiler=/WX")
        add_cxxflags("/FS")
        if CUDNN_ROOT ~= nil then
            add_linkdirs(CUDNN_ROOT .. "\\lib\\x64")
        end
    else
        -- NOTE(review): these are nvcc-style flags; confirm dlcc accepts (or
        -- safely ignores) -Xcompiler/--extended-lambda before relying on them.
        add_cuflags("-Xcompiler=-Wall", "-Xcompiler=-Werror")
        add_cuflags("-Xcompiler=-fPIC")
        add_cuflags("--extended-lambda")
        add_culdflags("-Xcompiler=-fPIC")
        add_cxxflags("-fPIC")
        add_cuflags("--expt-relaxed-constexpr")
        if CUDNN_ROOT ~= nil then
            add_linkdirs(CUDNN_ROOT .. "/lib")
        end
    end
    add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations")
    set_languages("cxx17")
    add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
    if has_config("ninetoothed") then
        add_files("../build/ninetoothed/*.c")
    end
target_end()
-- Static library with the QY runtime layer, built from the shared CUDA
-- runtime sources via the dlcc-based "qy.cuda" rule.
target("infinirt-qy")
    set_kind("static")
    add_deps("infini-utils")
    on_install(function (target) end)
    add_rules("qy.cuda", {override = true})
    if is_plat("windows") then
        add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler")
        add_cxxflags("/FS")
    else
        add_cuflags("-Xcompiler=-fPIC")
        add_culdflags("-Xcompiler=-fPIC")
        add_cxflags("-fPIC")
    end
    set_languages("cxx17")
    add_files("../src/infinirt/cuda/*.cu")
target_end()
-- Static library with the QY collective-communication layer; only built when
-- the "ccl" option is enabled, and links against NCCL (NCCL_ROOT or system).
target("infiniccl-qy")
    set_kind("static")
    add_deps("infinirt")
    on_install(function (target) end)
    if has_config("ccl") then
        add_rules("qy.cuda", {override = true})
        if not is_plat("windows") then
            add_cuflags("-Xcompiler=-fPIC")
            add_culdflags("-Xcompiler=-fPIC")
            add_cxflags("-fPIC")
            local nccl_root = os.getenv("NCCL_ROOT")
            if nccl_root then
                add_includedirs(nccl_root .. "/include")
                add_links(nccl_root .. "/lib/libnccl.so")
            else
                add_links("nccl") -- Fall back to default nccl linking
            end
            add_files("../src/infiniccl/cuda/*.cu")
        else
            print("[Warning] NCCL is not supported on Windows")
        end
    end
    set_languages("cxx17")
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment