@@ -1644,25 +1646,25 @@ class FlashCausalLM(Model):
f"PyTorch TunableOp is enabled. The warmup may take several minutes, picking the ROCm optimal matrix multiplication kernel for the target lengths {', '.join([str(seqlen)forseqlenintuning_sequences])}, with typical 5-8% latency improvement for small sequence lengths. The picked GEMMs are saved in the file {tunableop_filepath}. To disable TunableOp, please launch TGI with `PYTORCH_TUNABLEOP_ENABLED=0`.",
)
torch.cuda.tunable.set_filename(
tunableop_filepath,insert_device_ordinal=False
)
# torch.cuda.tunable.set_filename(
# tunableop_filepath, insert_device_ordinal=False
# )
ifos.path.isfile(tunableop_filepath):
log_master(
logger.info,
f"The file {tunableop_filepath} already exists and will be reused.",
)
torch.cuda.tunable.read_file(tunableop_filepath)
# if os.path.isfile(tunableop_filepath):
# log_master(
# logger.info,
# f"The file {tunableop_filepath} already exists and will be reused.",