Unverified commit 5f32dea1 authored by drbh, committed by GitHub

fix: prefer inplace softmax to avoid copy (#2661)



* fix: prefer inplace softmax to avoid copy

* Update server/text_generation_server/models/flash_causal_lm.py
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>

---------
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
parent 1b97e084
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -1922,8 +1922,9 @@ class FlashCausalLM(Model):
             batch.adapter_meta.adapter_indices = next_adapter_indices
 
         if prefill and prefill_logprobs:
-            # Get prefill logprobs
-            prefill_logprobs_tensor = torch.log_softmax(out, -1)
+            # Get prefill logprobs with inplace softmax (avoid copying the `out` tensor (max_batch_prefill_tokens * vocab_size))
+            torch.log_softmax(out, -1, out=out)
+            prefill_logprobs_tensor = out
             prefill_logprobs = torch.gather(
                 prefill_logprobs_tensor, 1, prefill_tokens_indices.view(-1, 1)
             )
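For readers who want to see the pattern in isolation, here is a minimal standalone sketch of the change. The tensor shape and token ids below are illustrative assumptions, not values from the repository; the `out=` overload of `torch.log_softmax` is the one the patched code itself relies on.

```python
import torch

# Illustrative shape only: in flash_causal_lm.py the tensor is
# (max_batch_prefill_tokens, vocab_size), which can be very large.
logits = torch.randn(4096, 32000)

# Out-of-place version: allocates a second tensor the same size as `logits`.
#   log_probs = torch.log_softmax(logits, -1)

# In-place version: writes the result back into `logits`, so no extra
# tensor of that size is allocated. The raw logits are overwritten, which
# is safe here because they are no longer needed after this point.
torch.log_softmax(logits, -1, out=logits)

# The log-probabilities of chosen token ids can then be gathered exactly as
# in the patched code (random ids here, purely for illustration).
token_ids = torch.randint(0, 32000, (4096, 1))
log_probs_of_tokens = torch.gather(logits, 1, token_ids)
```

The trade-off is that the in-place write destroys the original logits, so this is only valid when nothing downstream reads `out` again, which is the case at this point in `generate_token`.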