# NOTE: On ROCm systems, we use a ROPE implementation adapted from vLLM, which launches a single kernel for both query and key, contrary to the flash-attn implementation used on NVIDIA systems.
# When compiling the flash-attn rotary kernel on ROCm, hipcc appears unable to unroll loops, resulting in inference that is even slower than eager mode: https://github.com/pytorch/pytorch/issues/113773
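For reference, the fused path amounts to applying one rotary transform to query and key together. Below is a minimal pure-PyTorch sketch of that idea; it is illustrative only (the function name is made up, it is not the HIP/CUDA kernel, and it uses the "rotate half" convention, which may differ from the kernel's layout):

import torch

def apply_rotary_to_qk(query, key, cos, sin):
    # Apply the same cos/sin tables to query and key in one call,
    # mirroring the single-kernel approach described above.
    def rotate_half(x):
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    q_rot = query * cos + rotate_half(query) * sin
    k_rot = key * cos + rotate_half(key) * sin
    return q_rot, k_rot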
"Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
# For ROCm, we always use float cos/sin to avoid a cast.
# For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of the same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26
# But it later casts cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal.
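A sketch of the dtype handling these comments describe, as a hypothetical helper (the name and signature are assumptions, not TGI's API):

import torch

def get_cos_sin_for_kernel(cos: torch.Tensor, sin: torch.Tensor,
                           query: torch.Tensor, system: str):
    # Hypothetical helper: pick the cos/sin dtype expected by the backend.
    if system == "rocm":
        # Keep float32 cos/sin so the fused ROCm kernel needs no cast.
        return cos.float(), sin.float()
    # The flash-attn rotary kernel requires cos/sin to match the query dtype,
    # even though it later upcasts them to float internally.
    return cos.to(query.dtype), sin.to(query.dtype)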
f"The backend {SYSTEM} does not support sliding window attention that is used by the model type {model_type}. To use this model nonetheless with the {SYSTEM} backend, please launch TGI with the argument `--max-input-tokens` smaller than sliding_window={sliding_window} (got here max_input_tokens={max_input_tokens})."
)
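The error above reflects a simple constraint: if the backend cannot honor the model's sliding window, the input must be short enough that the window is never exceeded. A minimal sketch of that guard, with hypothetical names (check_sliding_window_support, supports_window):

def check_sliding_window_support(supports_window: bool, sliding_window,
                                 max_input_tokens: int, system: str, model_type: str):
    # Hypothetical guard illustrating the check that raises the error above.
    if sliding_window is not None and not supports_window:
        if max_input_tokens > sliding_window:
            raise ValueError(
                f"The backend {system} does not support sliding window attention "
                f"used by the model type {model_type}; launch with "
                f"--max-input-tokens smaller than sliding_window={sliding_window} "
                f"(got max_input_tokens={max_input_tokens})."
            )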
     if model_type == MAMBA:
         return Mamba(
             model_id,
             revision,
             quantize=quantize,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
...
...
@@ -212,14 +449,14 @@ def get_model(
             model_id,
             revision,
             quantize=quantize,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
     if (
-        model_type == "gpt_bigcode"
-        or model_type == "gpt2"
+        model_type == GPT_BIGCODE
+        or model_type == GPT2
         and model_id.startswith("bigcode/")
     ):
         if FLASH_ATTENTION:
...
...
@@ -227,7 +464,7 @@ def get_model(
             model_id,
             revision,
             quantize=quantize,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
...
...
@@ -240,37 +477,69 @@ def get_model(
             model_id,
             revision,
             quantize=quantize,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
-    if model_type == "bloom":
+    if model_type == BLOOM:
         return BLOOMSharded(
             model_id,
             revision,
             quantize=quantize,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
-    elif model_type == "mpt":
+    elif model_type == MPT:
         return MPTSharded(
             model_id,
             revision,
             quantize=quantize,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
-    elif model_type == "gpt_neox":
+    elif model_type == GPT2:
+        if FLASH_ATTENTION:
+            try:
+                return FlashGPT2(
+                    model_id,
+                    revision,
+                    quantize=quantize,
+                    speculator=speculator,
+                    dtype=dtype,
+                    trust_remote_code=trust_remote_code,
+                )
+            except RuntimeError as e:
+                # Lots of legacy models with various weight names.
# NOTE: On ROCm systems, we use a ROPE implementation adapted from vLLM, which launches a single kernel for both query and key, contrary to the flash-attn implementation used on NVIDIA systems.
# When compiling the flash-attn rotary kernel on ROCm, hipcc appears unable to unroll loops, resulting in inference that is even slower than eager mode: https://github.com/pytorch/pytorch/issues/113773
...
...
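More generally, the GPT-2 branch above follows a try-the-flash-kernel-then-fall-back pattern. Here is a minimal, self-contained sketch of that pattern with hypothetical names (load_with_fallback, load_flash_model, load_eager_model); it is not the elided TGI code:

import logging

logger = logging.getLogger(__name__)

def load_with_fallback(load_flash_model, load_eager_model, model_id: str):
    # Prefer the flash-attention implementation, but fall back to an eager
    # implementation when legacy checkpoints use unexpected weight names.
    try:
        return load_flash_model(model_id)
    except RuntimeError as e:
        logger.warning("Falling back to the eager implementation for %s: %s",
                       model_id, e)
        return load_eager_model(model_id)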
@@ -73,7 +82,7 @@ class CohereRotary(PositionRotaryEmbedding):
"Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
...
...
@@ -90,7 +99,7 @@ class CohereLayerNorm(nn.Module):