Unverified Commit 52f0dcf0 authored by thatPepe's avatar thatPepe Committed by GitHub
Browse files

Merge pull request #1019 from InfiniTensor/issue/1008

Issue/1008
parents d0f405ce 68026bd1
......@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
import torch
from framework import (
BaseOperatorTest,
TensorSpec,
......
......@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
import torch
from framework import (
BaseOperatorTest,
TensorSpec,
......
......@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
import torch
from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner
# Test cases format: (input_shape, input_strides_or_None, N)
......
......@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
import torch
from framework import (
BaseOperatorTest,
TensorSpec,
......
......@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
import torch
from framework import (
BaseOperatorTest,
TensorSpec,
......
......@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
import torch
from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner
# Test cases format: (vec1_shape, vec2_shape, vec1_strides_or_None, vec2_strides_or_None)
......
......@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
import torch
from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner
# Test cases format: (condition_shape, cond_strides_or_None, x_shape_or_None, y_shape_or_None)
......
......@@ -15,6 +15,7 @@ from libinfiniop import (
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
InfiniDeviceEnum,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -83,6 +84,12 @@ def test(
dtype=torch.float16,
sync=None,
):
# Skip strided cases on Iluvatar: GELU with non-contiguous tensors can hang the GPU (requires ixsmi -r to recover)
if device == InfiniDeviceEnum.ILUVATAR and (
input_stride is not None or output_stride is not None
):
return
input = TestTensor(shape, input_stride, dtype, device)
if inplace == Inplace.INPLACE:
if input_stride != output_stride:
......@@ -141,6 +148,9 @@ def test(
lib_gelu()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
......
......@@ -15,6 +15,7 @@ from libinfiniop import (
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
InfiniDeviceEnum,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -112,6 +113,12 @@ def test(
dtype=None,
sync=None,
):
# Skip strided cases on Iluvatar: Ones with non-contiguous tensors can hang the GPU (requires ixsmi -r to recover)
if device == InfiniDeviceEnum.ILUVATAR and (
x_stride is not None or y_stride is not None
):
return
if dtype in [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32, InfiniDtype.F64]:
x = TestTensor(shape, x_stride, dtype, device)
elif dtype in [InfiniDtype.BYTE, InfiniDtype.U8, InfiniDtype.U16, InfiniDtype.U32, InfiniDtype.U64,
......
......@@ -15,6 +15,7 @@ from libinfiniop import (
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
InfiniDeviceEnum,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -114,6 +115,12 @@ def test(
dtype=None,
sync=None,
):
# Skip strided cases on Iluvatar: Zeros with non-contiguous tensors can hang the GPU (requires ixsmi -r to recover)
if device == InfiniDeviceEnum.ILUVATAR and (
x_stride is not None or y_stride is not None
):
return
if dtype in [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32, InfiniDtype.F64]:
x = TestTensor(shape, x_stride, dtype, device)
elif dtype in [InfiniDtype.BYTE, InfiniDtype.U8, InfiniDtype.U16, InfiniDtype.U32, InfiniDtype.U64,
......
......@@ -115,10 +115,12 @@ option("iluvatar-gpu")
set_description("Whether to compile implementations for Iluvatar GPU")
option_end()
option("ivcore-20")
set_default(false)
option("iluvatar_arch")
set_default("ivcore20")
set_showmenu(true)
set_description("Use ivcore20")
set_description("Set Iluvatar GPU architecture (e.g. ivcore20)")
set_values("ivcore20")
set_category("option")
option_end()
if has_config("iluvatar-gpu") then
......
toolchain("iluvatar.toolchain")
local iluvatar_arch = get_config("iluvatar_arch") or "ivcore20"
toolchain("iluvatar.toolchain")
set_toolset("cc" , "clang" )
set_toolset("cxx" , "clang++")
set_toolset("cu" , "clang++")
......@@ -44,15 +46,15 @@ target("infiniop-iluvatar")
set_warnings("all", "error")
add_cuflags("-Wno-error=unused-private-field", "-Wno-error=unused-variable", "-Wno-unused-variable")
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
if has_config("ivcore-20") then
add_cuflags("--cuda-gpu-arch=ivcore20", {force = true})
end
add_cuflags("--cuda-gpu-arch=" .. iluvatar_arch, {force = true})
add_culdflags("-fPIC")
add_cxflags("-fPIC", "-Wno-error=unused-variable", "-Wno-unused-variable")
add_cxxflags("-fPIC", "-Wno-error=unused-variable", "-Wno-unused-variable")
-- set_languages("cxx17") 天数似乎不能用这个配置
add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
-- skip scaled_mm, adapt it later
-- remove_files("../src/infiniop/ops/scaled_mm/nvidia/*.cu")
-- 天数平台不支持部分 NVIDIA PTX 指令,AWQ 反量化改用 CUDA C++ 实现
add_files("../src/infiniop/ops/dequantize_awq/iluvatar/*.cu")
......@@ -75,6 +77,7 @@ target("infinirt-iluvatar")
set_warnings("all", "error")
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
add_cuflags("--cuda-gpu-arch=" .. iluvatar_arch, {force = true})
add_culdflags("-fPIC")
add_cxflags("-fPIC")
add_cxxflags("-fPIC")
......@@ -97,6 +100,7 @@ target("infiniccl-iluvatar")
set_warnings("all", "error")
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
add_cuflags("--cuda-gpu-arch=" .. iluvatar_arch, {force = true})
add_culdflags("-fPIC")
add_cxflags("-fPIC")
add_cxxflags("-fPIC")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment