Unverified Commit abd58ff8 authored by Nicolas Patry, committed by GitHub

feat(server): Rework model loading (#344)

# What does this PR do?

This reworks the loading logic. The idea is to use cleaner loading code:

- Remove the need for `no_init_weights`
- Remove all the weird `bnb_linear`, `load_weights`, and `post_load_weights` logic.

New code layout:

- A new class, `Weights`, is in charge of loading the weights from multiple
  files into the appropriate tensors (potentially sharded).
- TP layers are now "shells": they contain the code that knows what kind of
  sharding is needed, plus the `all_reduce` when one is required. They do not
  inherit from linear; instead they contain some kind of linear (see the
  sketch after this list).
- The contained linear can be either `FastLinear`, BnbLinear or, next, GPTQ
  linear.
- All modeling code is explicitly written for sharding; the process group is
  just a no-op for non-sharded code (which removes a lot of test cases).
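
To make the "shell" idea concrete, here is a minimal sketch of a row-parallel shell wrapping a contained linear. The class names (`FastLinearSketch`, `TensorParallelRowLinearSketch`) and exact signatures are illustrative, not the ones from this PR; the only assumption is a process group that exposes `allreduce`, either a real `torch.distributed` group or the `FakeGroup` added below for the non-sharded case.

```python
import torch
from torch import nn


class FastLinearSketch(nn.Module):
    # A plain linear over weights that were already loaded (and possibly sharded).
    def __init__(self, weight, bias=None):
        super().__init__()
        self.weight = nn.Parameter(weight, requires_grad=False)
        self.bias = nn.Parameter(bias, requires_grad=False) if bias is not None else None

    def forward(self, x):
        return torch.nn.functional.linear(x, self.weight, self.bias)


class TensorParallelRowLinearSketch(nn.Module):
    # The "shell": it does not inherit from linear, it *contains* one,
    # plus the process group it needs for the final all_reduce.
    def __init__(self, linear, process_group):
        super().__init__()
        self.linear = linear
        self.process_group = process_group

    def forward(self, x):
        out = self.linear(x)
        # Mirrors the c10d ProcessGroup API: on a real group this sums the
        # partial outputs of every shard; on FakeGroup it returns a no-op
        # barrier, so the same code path runs unsharded.
        self.process_group.allreduce(out).wait()
        return out
```

A column-parallel shell would be the mirror image: its contained linear is built from weights sharded along the output dimension, and it usually needs no collective of its own because the following row-parallel layer performs the `all_reduce`.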

![Screenshot from 2023-05-19 23-19-59](https://github.com/huggingface/text-generation-inference/assets/204321/9a802654-74a3-488c-87a8-073743a6143f)

---------
Co-authored-by: Ubuntu <ubuntu@ip-172-31-41-161.taildb5d.ts.net>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-41-161.ec2.internal>
Co-authored-by: OlivierDehaene <olivier@huggingface.co>
Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
parent 19c41824
@@ -4,6 +4,37 @@ import torch
 from datetime import timedelta
 
 
+class FakeBarrier:
+    def wait(self):
+        pass
+
+
+class FakeGroup:
+    def __init__(self, rank, size):
+        self._rank = rank
+        self._size = size
+
+    def allreduce(self, *args, **kwargs):
+        return FakeBarrier()
+
+    def allgather(self, inputs, local_tensor, **kwargs):
+        assert (
+            len(inputs[0]) == len(local_tensor) == 1
+        ), f"{len(inputs[0])} != {len(local_tensor)} != 1, and the FakeGroup is supposed to join on simple tensors"
+        for input_ in inputs:
+            input_[0].data = local_tensor[0].data
+        return FakeBarrier()
+
+    def barrier(self, *args, **kwargs):
+        return FakeBarrier()
+
+    def size(self):
+        return self._size
+
+    def rank(self):
+        return self._rank
+
+
 def initialize_torch_distributed():
     rank = int(os.getenv("RANK", "0"))
     world_size = int(os.getenv("WORLD_SIZE", "1"))
@@ -23,13 +54,18 @@ def initialize_torch_distributed():
         backend = "gloo"
         options = None
 
-    # Call the init process.
-    torch.distributed.init_process_group(
-        backend=backend,
-        world_size=world_size,
-        rank=rank,
-        timeout=timedelta(seconds=60),
-        pg_options=options,
-    )
+    if world_size == 1:
+        return FakeGroup(rank, world_size), rank, world_size
+    else:
+        if os.getenv("DEBUG", None) == "1":
+            return FakeGroup(rank, world_size), rank, world_size
+        # Call the init process.
+        torch.distributed.init_process_group(
+            backend=backend,
+            world_size=world_size,
+            rank=rank,
+            timeout=timedelta(seconds=60),
+            pg_options=options,
+        )
 
-    return torch.distributed.group.WORLD, rank, world_size
+        return torch.distributed.group.WORLD, rank, world_size
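
As a quick, hypothetical illustration of why `FakeGroup` exists: on a single process (`WORLD_SIZE` unset or `1`, or `DEBUG=1`), `initialize_torch_distributed` hands back a `FakeGroup`, so sharding-aware code can still call the collectives and they simply do nothing.

```python
# Hypothetical single-process usage sketch; `initialize_torch_distributed`
# and `FakeGroup` come from the diff above, only `torch` is imported here.
import torch

process_group, rank, world_size = initialize_torch_distributed()

x = torch.ones(4)
# With world_size == 1 this hits FakeGroup.allreduce: it returns a FakeBarrier
# whose wait() does nothing, so `x` is left untouched.
process_group.allreduce(x).wait()

assert process_group.size() == world_size == 1
assert process_group.rank() == rank == 0
```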
from pathlib import Path
from typing import List

from safetensors import safe_open


class Weights:
    def __init__(self, filenames: List[Path], device, dtype, process_group):
        # Map every tensor name to the safetensors file that contains it.
        routing = {}
        for filename in filenames:
            with safe_open(filename, framework="pytorch") as f:
                for k in f.keys():
                    if k in routing:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename
        self.routing = routing
        self.device = device
        self.dtype = dtype
        self.process_group = process_group
        self._handles = {}

    def _get_handle(self, filename):
        # Lazily open each file once and cache the handle.
        if filename not in self._handles:
            f = safe_open(filename, framework="pytorch")
            self._handles[filename] = f

        return self._handles[filename]

    def get_filename(self, tensor_name: str) -> str:
        filename = self.routing.get(tensor_name, None)
        if filename is None:
            raise RuntimeError(f"weight {tensor_name} does not exist")
        return str(filename)

    def _get_slice(self, tensor_name: str):
        filename = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        return slice_

    def get_shape(self, tensor_name: str):
        return self._get_slice(tensor_name).get_shape()

    def get_tensor(self, tensor_name: str):
        # Load the full (unsharded) tensor on the target dtype/device.
        filename = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        tensor = f.get_tensor(tensor_name)
        tensor = tensor.to(dtype=self.dtype)
        tensor = tensor.to(device=self.device)
        return tensor

    def get_sharded(self, tensor_name: str, dim: int):
        # Load only this rank's slice of the tensor along `dim`.
        filename = self.get_filename(tensor_name)
        world_size = self.process_group.size()
        rank = self.process_group.rank()

        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        size = slice_.get_shape()[dim]
        block_size = size // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size

        assert (
            size % world_size == 0
        ), f"The chosen size {size} is not compatible with sharding on {world_size} shards"

        if dim == 0:
            tensor = slice_[start:stop]
        elif dim == 1:
            tensor = slice_[:, start:stop]
        else:
            raise NotImplementedError("Let's make that generic when needed")
        tensor = tensor.to(dtype=self.dtype)
        tensor = tensor.to(device=self.device)
        return tensor
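
As a hedged illustration of how this class is meant to be used, the sketch below loads one sharded weight per parallelism style plus a full tensor. The file names, tensor names, and the pairing of `get_sharded` dimensions with column/row parallelism are assumptions for the example, not literal code from this PR.

```python
# Hypothetical usage sketch (file and tensor names are made up); assumes
# `Weights` above and `initialize_torch_distributed` from the diff earlier
# are in scope.
from pathlib import Path

import torch

process_group, rank, world_size = initialize_torch_distributed()
device = torch.device("cuda", rank) if torch.cuda.is_available() else "cpu"

weights = Weights(
    filenames=[
        Path("model-00001-of-00002.safetensors"),
        Path("model-00002-of-00002.safetensors"),
    ],
    device=device,
    dtype=torch.float16,
    process_group=process_group,
)

# Column-parallel weight: shard the output dim (dim 0 of an [out, in] matrix).
q_proj = weights.get_sharded("model.layers.0.self_attn.q_proj.weight", dim=0)

# Row-parallel weight: shard the input dim (dim 1); the TP shell all_reduces
# the partial outputs after the matmul.
o_proj = weights.get_sharded("model.layers.0.self_attn.o_proj.weight", dim=1)

# Small tensors such as layer norms are loaded whole on every rank.
ln_w = weights.get_tensor("model.layers.0.input_layernorm.weight")
```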