feat(server): do not use device_map auto on single GPU (#362)

e9669a40 · OlivierDehaene · GitHub · cfaa8580 · e9669a40 · e9669a40
Unverified commit e9669a40, authored May 23, 2023 by OlivierDehaene; committed by GitHub on May 23, 2023.
Showing 2 changed files with 8 additions and 2 deletions

server/text_generation_server/models/causal_lm.py +4 -1

server/text_generation_server/models/seq2seq_lm.py +4 -1

No files found.
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -468,9 +468,12 @@ class CausalLM(Model):
            model_id,
            revision=revision,
            torch_dtype=dtype,
-            device_map="auto" if torch.cuda.is_available() else None,
+            device_map="auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
            load_in_8bit=quantize == "bitsandbytes",
        )
+        if torch.cuda.is_available() and torch.cuda.device_count() == 1:
+            model = model.cuda()
        tokenizer.pad_token_id = (
            model.config.pad_token_id
            if model.config.pad_token_id is not None

--- a/server/text_generation_server/models/seq2seq_lm.py
+++ b/server/text_generation_server/models/seq2seq_lm.py
@@ -518,9 +518,12 @@ class Seq2SeqLM(Model):
            model_id,
            revision=revision,
            torch_dtype=dtype,
-            device_map="auto" if torch.cuda.is_available() else None,
+            device_map="auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
            load_in_8bit=quantize == "bitsandbytes",
        )
+        if torch.cuda.is_available() and torch.cuda.device_count() == 1:
+            model = model.cuda()
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, revision=revision, padding_side="left", truncation_side="left"
        )