Unverified commit e33192b2, authored by cwazai and committed by GitHub
Browse files

[lora/moe] Improve fused MoE‑LoRA kernel indexing and memory access (#32770)


Signed-off-by: 陈建华 <1647430658@qq.com>
Signed-off-by: Yanwen Lin <lyw1124278064@gmail.com>
Signed-off-by: kimheesu <wlskaka4@gmail.com>
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: ganyi <ygan@amd.com>
Signed-off-by: whx-sjtu <2952154980@qq.com>
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
Signed-off-by: Xin Yang <xyangx@amazon.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Matthew Wong <Matthew.Wong2@amd.com>
Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
Signed-off-by: Ifta Khairul Alam Adil <ikaadil007@gmail.com>
Signed-off-by: Ifta khairul Alam Adil <25082512+ikaadil@users.noreply.github.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Signed-off-by: Huy Do <huydhn@gmail.com>
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
Signed-off-by: Kebe <mail@kebe7jun.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Alex Sun <alex.s@amd.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Liran Schour <lirans@il.ibm.com>
Signed-off-by: liranschour <liranschour@users.noreply.github.com>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: Or Ozeri <oro@il.ibm.com>
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Signed-off-by: Richard Zou <zou3519@gmail.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
Signed-off-by: AuYang <459461160@qq.com>
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: rickychen-infinirc <ricky.chen@infinirc.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com>
Signed-off-by: eldarkurtic <8884008+eldarkurtic@users.noreply.github.com>
Signed-off-by: Bill Nell <bnell@redhat.com>
Signed-off-by: RishabhSaini <rishabhsaini01@gmail.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: Karan Bansal <karanb192@gmail.com>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Li, Jiang <bigpyj64@gmail.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Tianshu Yu <tianshuyu.formal@gmail.com>
Signed-off-by: raushan <raushan@huggingface.co>
Signed-off-by: baonudesifeizhai <baonudesifeizhai@gmail.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: sangbumlikeagod <oironese@naver.com>
Signed-off-by: sangbumlikeagod <98077576+sangbumlikeagod@users.noreply.github.com>
Signed-off-by: Matteo Fari <matteofari06@gmail.com>
Signed-off-by: huanghaoyan.hhy <huanghaoyan.hhy@alibaba-inc.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: Orion Reblitz-Richardson <orionr@meta.com>
Signed-off-by: Orion Reblitz-Richardson <orionr@gmail.com>
Signed-off-by: marksverdhei <marksverdhei@hotmail.com>
Signed-off-by: Markus / Mark <46672778+marksverdhei@users.noreply.github.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Randall Smith <ransmith@amd.com>
Signed-off-by: jon <joninco@bullpoint.org>
Signed-off-by: dolpm <34420038+dolpm@users.noreply.github.com>
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: Luka Govedič <luka.govedic@gmail.com>
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Signed-off-by: mohammad najafi <mohammad.najafi@amd.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: 7. Sun <jhao.sun@gmail.com>
Signed-off-by: esmeetu <jasonailu87@gmail.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Reagan <reaganjlee@gmail.com>
Signed-off-by: Reagan Lee <96998476+reaganjlee@users.noreply.github.com>
Signed-off-by: Hongjian Zhang <zhanghongjian@xiaohongshu.com>
Signed-off-by: Xingran Wang <wangxingran123456@outlook.com>
Signed-off-by: Hiroken. <105287758+HirokenOvo@users.noreply.github.com>
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Signed-off-by: Louie Tsai <louie.tsai@intel.com>
Signed-off-by: Maryam Tahhan <mtahhan@redhat.com>
Signed-off-by: Joshua Deng <joshuakdeng@gmail.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
Signed-off-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: cwazai <38356712+cwazai@users.noreply.github.com>
Co-authored-by: Yanwen Lin <lyw1124278064@gmail.com>
Co-authored-by: Kim Hee Su <wlskaka4@gmail.com>
Co-authored-by: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Pleaplusone <ygan@amd.com>
Co-authored-by: whx <56632993+whx-sjtu@users.noreply.github.com>
Co-authored-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: danisereb <daserebrenik@nvidia.com>
Co-authored-by: Yanan Cao <gmagogsfm@users.noreply.github.com>
Co-authored-by: Xin Yang <105740670+xyang16@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Matt <156021403+mawong-amd@users.noreply.github.com>
Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com>
Co-authored-by: Lucain <lucainp@gmail.com>
Co-authored-by: Ifta khairul Alam Adil <25082512+ikaadil@users.noreply.github.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Huy Do <huydhn@gmail.com>
Co-authored-by: Micah Williamson <micah.williamson@amd.com>
Co-authored-by: Andreas Karatzas <akaratza@amd.com>
Co-authored-by: Matthew Wong <Matthew.Wong2@amd.com>
Co-authored-by: Kebe <mail@kebe7jun.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Alex Sun <minchsun@amd.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: liranschour <liranschour@users.noreply.github.com>
Co-authored-by: Or Ozeri <or@ozery.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Shengqi Chen <harry-chen@outlook.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Or Ozeri <oro@il.ibm.com>
Co-authored-by: Lucas Kabela <lucaskabela@meta.com>
Co-authored-by: Richard Zou <zou3519@users.noreply.github.com>
Co-authored-by: Maximilien de Bayser <maxdebayser@gmail.com>
Co-authored-by: Xu Jinyang <72930776+AuYang261@users.noreply.github.com>
Co-authored-by: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: David Ramon Prados <davidramon3@hotmail.es>
Co-authored-by: RickyChen / 陳昭儒 <ricky.chen@infinirc.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Fadi Arafeh <115173828+fadara01@users.noreply.github.com>
Co-authored-by: Eldar Kurtić <8884008+eldarkurtic@users.noreply.github.com>
Co-authored-by: bnellnm <49004751+bnellnm@users.noreply.github.com>
Co-authored-by: Rishabh Saini <rishabhsaini01@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Karan Bansal <karanb192@users.noreply.github.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: tianshu-Michael-yu <101950379+tianshu-Michael-yu@users.noreply.github.com>
Co-authored-by: Raushan Turganbay <raushan@huggingface.co>
Co-authored-by: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: sangbumlikeagod <98077576+sangbumlikeagod@users.noreply.github.com>
Co-authored-by: Matteo Fari <matteofari06@gmail.com>
Co-authored-by: Harry Huang <vastrockhuang162@gmail.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Orion Reblitz-Richardson <orionr@gmail.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
Co-authored-by: Markus / Mark <46672778+marksverdhei@users.noreply.github.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: rasmith <Randall.Smith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
Co-authored-by: joninco <joninco@bullpoint.org>
Co-authored-by: dolpm <34420038+dolpm@users.noreply.github.com>
Co-authored-by: ElizaWszola <ewszola@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Luka Govedič <luka.govedic@gmail.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Joe Runde <Joseph.Runde@ibm.com>
Co-authored-by: monajafi-amd <mohammad.najafi@amd.com>
Co-authored-by: ruizcrp <ruiz.crp@gmail.com>
Co-authored-by: Shengqi Chen <i@harrychen.xyz>
Co-authored-by: 7. Sun <jhao.sun@gmail.com>
Co-authored-by: Roy Wang <jasonailu87@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Reagan Lee <96998476+reaganjlee@users.noreply.github.com>
Co-authored-by: Hiroken. <105287758+HirokenOvo@users.noreply.github.com>
Co-authored-by: Xingran Wang <wangxingran123456@outlook.com>
Co-authored-by: david guan <102001211+Chenhao-Guan@users.noreply.github.com>
Co-authored-by: Lukas Geiger <lukas.geiger94@gmail.com>
Co-authored-by: Louie Tsai <louie.tsai@intel.com>
Co-authored-by: Maryam Tahhan <mtahhan@redhat.com>
Co-authored-by: Joshua Deng <91448271+joshuadeng@users.noreply.github.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Co-authored-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
parent 61274bde
......@@ -62,6 +62,7 @@ def _fused_moe_lora_kernel(
num_experts,
lora_ids,
adapter_enabled,
max_loras, # <<< PR2: rename, used for masks when grid axis-2 != max_loras
# The stride variables represent how much to increase the ptr by when
# moving by 1 element in a particular dimension. E.g. `stride_am` is
# how much to increase `a_ptr` by to get the element one row down
......@@ -83,6 +84,7 @@ def _fused_moe_lora_kernel(
num_slice_c: tl.constexpr,
top_k: tl.constexpr,
MUL_ROUTED_WEIGHT: tl.constexpr,
USE_B_L2_CACHE: tl.constexpr, # new, enable .ca load for B
BLOCK_SIZE_M: tl.constexpr,
BLOCK_SIZE_N: tl.constexpr,
BLOCK_SIZE_K: tl.constexpr,
......@@ -104,10 +106,13 @@ def _fused_moe_lora_kernel(
if moe_enabled == 0:
# Early exit for the no moe lora case.
return
# The grid size on axis 2 is (max_loras + 1) to handle the no-lora case
# (lora_id == -1), but sorted_token_ids and expert_ids are allocated with
# shape (max_loras, ...). Use (num_programs - 1) for correct bounds checking.
max_loras = tl.num_programs(axis=2) - 1
# The grid's axis-2 dimension is max_loras + 1 to accommodate the -1 sentinel.
# This guard ensures we don't access sorted_token_ids / expert_ids /
# num_tokens_post_padded beyond their allocated bounds if an invalid
# lora_id somehow appears. Although the caller should pass correct
# max_loras, defensive programming prevents accidental out-of-bounds.
if lora_id >= max_loras:
return
grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)
# calculate pid_m,pid_n
......@@ -136,10 +141,11 @@ def _fused_moe_lora_kernel(
cur_b_ptr = tl.load(b_ptr + slice_id).to(tl.pointer_type(c_ptr.dtype.element_ty))
cur_c_ptr = c_ptr + (slice_id % num_slice_c) * slice_c_size
offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
# remove modulo wrap-around
offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int32)
offs_k = pid_sk * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int32)
token_ind = stride_tl * lora_id + offs_token_id
offs_token = tl.load(
sorted_token_ids_ptr + token_ind,
......@@ -176,7 +182,13 @@ def _fused_moe_lora_kernel(
# GDC wait waits for ALL programs in the prior kernel to complete
# before continuing.
# pre-fetch lora weight
b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)
# add (offs_bn < N) mask; optional .ca for B
b_mask = (offs_k[:, None] < k_remaining) & (offs_bn[None, :] < N)
if USE_B_L2_CACHE:
b = tl.load(b_ptrs, mask=b_mask, other=0.0, cache_modifier=".ca")
else:
b = tl.load(b_ptrs, mask=b_mask, other=0.0)
if USE_GDC and not IS_PRIMARY:
tl.extra.cuda.gdc_wait()
a = tl.load(
......@@ -276,6 +288,7 @@ def _fused_moe_lora_shrink(
num_experts,
lora_ids,
adapter_enabled,
lora_a_stacked[0].shape[0],
qcurr_hidden_states.stride(0),
qcurr_hidden_states.stride(1),
w1_lora_a_stacked.stride(0),
......@@ -292,6 +305,7 @@ def _fused_moe_lora_shrink(
num_slice_c=num_slices,
top_k=1 if mul_routed_weight else top_k_num,
MUL_ROUTED_WEIGHT=False,
USE_B_L2_CACHE=True, # new
IS_PRIMARY=True,
**shrink_config,
)
......@@ -377,6 +391,7 @@ def _fused_moe_lora_expand(
num_experts,
lora_ids,
adapter_enabled,
lora_b_stacked[0].shape[0],
a_intermediate_cache1.stride(0),
a_intermediate_cache1.stride(1),
w1_lora_b_stacked.stride(0),
......@@ -393,6 +408,7 @@ def _fused_moe_lora_expand(
num_slice_c=num_slices,
top_k=1,
MUL_ROUTED_WEIGHT=mul_routed_weight,
USE_B_L2_CACHE=True, # new
IS_PRIMARY=False,
**expand_config,
)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment