fix(server): fix quantization

bf7f1d54 · OlivierDehaene · 49a6c8c1
Commit bf7f1d54 authored May 30, 2023 by OlivierDehaene
4 changed files
--- a/server/text_generation_server/models/bloom.py
+++ b/server/text_generation_server/models/bloom.py
@@ -245,14 +245,12 @@ class BLOOMSharded(BLOOM):
                                return linear

                            module.linear = replace_linear(state)
-                        elif quantize == "gptq":
-                            raise NotImplementedError(
-                                "`gptq` is not implemented for now"
-                            )
-                        elif quantize is None:
-                            tensor = tensor.to(device)
-                        else:
-                            raise ValueError(f"Unexpected quantize `{quantize}`")
+                    elif quantize == "gptq":
+                        raise NotImplementedError("`gptq` is not implemented for now")
+                    elif quantize is None:
+                        tensor = tensor.to(device)
+                    else:
+                        raise ValueError(f"Unexpected quantize `{quantize}`")

                    module._parameters[param_name] = tensor
                    if name == "word_embeddings.weight":

--- a/server/text_generation_server/models/galactica.py
+++ b/server/text_generation_server/models/galactica.py
@@ -364,14 +364,12 @@ class GalacticaSharded(Galactica):
                                return linear

                            module.linear = replace_linear(state)
-                        elif quantize == "gptq":
-                            raise NotImplementedError(
-                                "`gptq` is not implemented for now"
-                            )
-                        elif quantize is None:
-                            tensor = tensor.to(device)
-                        else:
-                            raise ValueError(f"Unexpected quantize `{quantize}`")
+                    elif quantize == "gptq":
+                        raise NotImplementedError("`gptq` is not implemented for now")
+                    elif quantize is None:
+                        tensor = tensor.to(device)
+                    else:
+                        raise ValueError(f"Unexpected quantize `{quantize}`")

                    module._parameters[param_name] = tensor
                    if name == "model.decoder.embed_tokens.weight":

--- a/server/text_generation_server/models/gpt_neox.py
+++ b/server/text_generation_server/models/gpt_neox.py
@@ -210,14 +210,12 @@ class GPTNeoxSharded(CausalLM):
                                return linear

                            module.linear = replace_linear(state)
-                        elif quantize == "gptq":
-                            raise NotImplementedError(
-                                "`gptq` is not implemented for now"
-                            )
-                        elif quantize is None:
-                            tensor = tensor.to(device)
-                        else:
-                            raise ValueError(f"Unexpected quantize `{quantize}`")
+                    elif quantize == "gptq":
+                        raise NotImplementedError("`gptq` is not implemented for now")
+                    elif quantize is None:
+                        tensor = tensor.to(device)
+                    else:
+                        raise ValueError(f"Unexpected quantize `{quantize}`")

                    if current_parameter_tensor is not None:
                        module._parameters[param_name] = tensor

--- a/server/text_generation_server/models/t5.py
+++ b/server/text_generation_server/models/t5.py
@@ -223,14 +223,12 @@ class T5Sharded(Seq2SeqLM):

                            module.linear = replace_linear(state)

-                        elif quantize == "gptq" and not module_name.endswith("wo"):
-                            raise NotImplementedError(
-                                "`gptq` is not implemented for now"
-                            )
-                        elif quantize is None:
-                            tensor = tensor.to(device)
-                        else:
-                            raise ValueError(f"Unexpected quantize `{quantize}`")
+                    elif quantize == "gptq" and not module_name.endswith("wo"):
+                        raise NotImplementedError("`gptq` is not implemented for now")
+                    elif quantize is None or module_name.endswith("wo"):
+                        tensor = tensor.to(device)
+                    else:
+                        raise ValueError(f"Unexpected quantize `{quantize}`")

                    if current_parameter_tensor is not None:
                        module._parameters[param_name] = tensor