vLLM 推理时 Triton 编译报错(unsupported target: 'gfx906')
硬件为海光 Z100 × 7(共 7 张加速卡,架构为 gfx906)
使用 Docker 镜像运行,安装了 transformers==4.51.0。已确认所有库的版本与 README.md 中列出的版本完全一致。
报错内容如下:
loc("/tmp/torchinductor_root/u3/cu3ylvtuhboef27w4somh3zkah4usi4owcutsnghaqnzzthttm76.py":18:0): error: unsupported target: 'gfx906'
loc("/tmp/torchinductor_root/yt/cythmg6p65pbebqbljvdaal5iochwg5zbprfvfziuvfce5xvhcc7.py":18:0): error: unsupported target: 'gfx906'
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] Triton compilation failed: triton_poi_fused_add_bitwise_and_bitwise_not_bitwise_or_ge_lt_mul_sub_0
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] def triton_(in_ptr0, out_ptr0, out_ptr1, ks0, ks1, ks2, ks3, xnumel, XBLOCK : tl.constexpr):
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] xoffset = tl.program_id(0) * XBLOCK
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] xindex = xoffset + tl.arange(0, XBLOCK)[:]
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] xmask = xindex < xnumel
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] x0 = xindex
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp0 = tl.load(in_ptr0 + (x0), xmask)
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp1 = ks0
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp2 = tmp0 >= tmp1
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp3 = ks1
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp4 = tmp0 < tmp3
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp5 = tmp2 & tmp4
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp6 = ks2
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp7 = tmp0 >= tmp6
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp8 = ks3
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp9 = tmp0 < tmp8
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp10 = tmp7 & tmp9
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp11 = tmp5 | tmp10
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp12 = tmp11.to(tl.int64)
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp13 = tmp5.to(tl.int64)
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp14 = tmp13 * tmp1
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp15 = tmp10.to(tl.int64)
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp16 = ks0 + ks2 + ((-1)*ks1)
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp17 = tmp15 * tmp16
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp18 = tmp14 + tmp17
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp19 = tmp0 - tmp18
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp20 = tmp12 * tmp19
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tmp21 = tmp11 == 0
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tl.store(out_ptr0 + (x0), tmp20, xmask)
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] tl.store(out_ptr1 + (x0), tmp21, xmask)
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0]
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] metadata: {'signature': {'in_ptr0': '*i64', 'out_ptr0': '*i64', 'out_ptr1': '*i1', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'ks3': 'i32', 'xnumel': 'i32'}, 'device': 1, 'constants': {'XBLOCK': 512}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2), equal_to_1=(), within_2gb=set(), non_negative=set())], 'device_type': 'hip', 'num_warps': 8, 'num_stages': 1, 'debug': False, 'cc': 'gfx906'}
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] Traceback (most recent call last):
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] File "/usr/local/lib/python3.10/dist-packages/torch/_inductor/runtime/triton_heuristics.py", line 431, in _precompile_config
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] binary = triton.compile(*compile_args, **compile_kwargs)
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] File "/usr/local/lib/python3.10/dist-packages/triton/compiler/compiler.py", line 311, in compile
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] next_module = compile_ir(module, metadata)
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] File "/usr/local/lib/python3.10/dist-packages/triton/backends/amd/compiler.py", line 326, in <lambda>
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options)
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] File "/usr/local/lib/python3.10/dist-packages/triton/backends/amd/compiler.py", line 255, in make_llir
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] pm.run(mod)
(VllmWorkerProcess pid=894) [rank1]:E0502 16:26:47.688000 140530375006016 torch/_inductor/runtime/triton_heuristics.py:433] [0/0] RuntimeError: PassManager::run failed