OpenDAS / text-generation-inference

Commit 4f6d038c: fix(server): fix multinomial implem in Sampling

Authored May 11, 2023 by OlivierDehaene
Parent: a6c18c39
Showing 1 changed file with 3 additions and 3 deletions.

server/text_generation_server/utils/tokens.py (+3 -3)
@@ -25,10 +25,10 @@ class Sampling:
     def __call__(self, logits):
         probs = torch.nn.functional.softmax(logits, -1)
         # Avoid GPU<->CPU sync done by torch multinomial
         # See: https://github.com/pytorch/pytorch/blob/925a3788ec5c06db62ca732a0e9425a26a00916f/aten/src/ATen/native/Distributions.cpp#L631-L637
-        q = torch.empty_like(probs).exponential_(1, generator=self.generator).div_(probs)
-        return q.argmax()
+        q = torch.empty_like(probs).exponential_(1, generator=self.generator)
+        return probs.div_(q).argmax()


 class Greedy:
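For context, the fixed line relies on the "exponential race" sampling identity: if q_i ~ Exp(1) are drawn i.i.d., then argmax_i(p_i / q_i) is distributed Categorical(p), so a weighted sample can be taken with a single argmax and no GPU<->CPU sync. The removed version effectively computed argmax_i(q_i / p_i), which inverts the ordering (an argmin would have been required there). The sketch below is not part of the commit; it just checks the identity empirically with plain PyTorch:

```python
# Sketch (not from the repo): empirically check the exponential-race
# sampling trick used by the fixed line.
#
# Claim: for q_i ~ Exp(1) i.i.d., argmax_i(p_i / q_i) ~ Categorical(p).
import torch

torch.manual_seed(0)
logits = torch.tensor([2.0, 1.0, 0.5, -1.0])
probs = torch.nn.functional.softmax(logits, -1)

n = 50_000
# One row of exponential noise per draw, as in the fixed code
q = torch.empty(n, probs.numel()).exponential_(1)
samples = (probs / q).argmax(dim=1)

# Empirical frequencies should closely match the target probabilities
empirical = torch.bincount(samples, minlength=probs.numel()).float() / n
```

With 50k draws the empirical frequencies land within a couple of percentage points of `probs`; swapping the argmax expression for `(q / probs).argmax(dim=1)` (the removed version) visibly skews the histogram toward the low-probability tokens.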