[XPU] Support Triton path for LoRA operations on XPU (#28511)

Signed-off-by: Fanli Lin <fanli.lin@intel.com>

[XPU] Support Triton path for LoRA operations on XPU (#28511)
Signed-off-by: Fanli Lin <fanli.lin@intel.com>
dbbe0c75 · Fanli Lin · GitHub · 7dca0c90 · dbbe0c75 · dbbe0c75
Unverified commit dbbe0c75, authored Nov 13, 2025 by Fanli Lin; committed by GitHub on Nov 13, 2025
3 changed files
--- a/vllm/lora/ops/triton_ops/lora_expand_op.py
+++ b/vllm/lora/ops/triton_ops/lora_expand_op.py
@@ -48,6 +48,7 @@ def _lora_expand_kernel(
    SLICE_NUM: tl.constexpr,
    SAME_STRIDE: tl.constexpr,
    USE_GDC: tl.constexpr,
+    launch_pdl: tl.constexpr,
 ):
    cta_n_num = tl.cdiv(N, BLOCK_N)
    cta_m_num = tl.cdiv(M, BLOCK_M)

--- a/vllm/lora/ops/triton_ops/lora_shrink_op.py
+++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py
@@ -46,6 +46,7 @@ def _lora_shrink_kernel(
    GROUP_SIZE_M: tl.constexpr,
    SLICE_NUM: tl.constexpr,
    USE_GDC: tl.constexpr,
+    launch_pdl: tl.constexpr,
 ):
    cta_n_num = tl.cdiv(N, BLOCK_N)
    cta_m_num = tl.cdiv(M, BLOCK_M)

--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -101,7 +101,11 @@ class XPUPlatform(Platform):

    @classmethod
    def get_punica_wrapper(cls) -> str:
+        xpu_use_triton_kernel = os.getenv("XPU_USE_TRITON_KERNEL", "0") == "1"
+        if not xpu_use_triton_kernel:
            return "vllm.lora.punica_wrapper.punica_xpu.PunicaWrapperXPU"
+        else:
+            return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int: