OpenDAS / text-generation-inference · Commits

Commit 295831a4
Authored Oct 08, 2022 by Olivier Dehaene

Init
Showing 3 changed files with 116 additions and 0 deletions:

server/bloom_inference/utils.py   +95 −0
server/poetry.lock                +0 −0
server/pyproject.toml             +21 −0
server/bloom_inference/utils.py (new file, mode 100644)
import os
import contextlib

import torch
import torch.distributed

from transformers.generation_logits_process import (
    LogitsProcessorList,
    TemperatureLogitsWarper,
    TopPLogitsWarper,
    TopKLogitsWarper,
)


class Sampling:
    def __call__(self, logits):
        probs = torch.nn.functional.softmax(logits, dim=-1)
        next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
        return next_tokens


class Greedy:
    def __call__(self, logits):
        return logits.argmax(dim=-1)


class NextTokenChooser:
    def __init__(self, temperature=1.0, top_k=None, top_p=None, do_sample=False):
        warpers = LogitsProcessorList()
        # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
        # all samplers can be found in `generation_utils_samplers.py`
        sampling = do_sample
        if temperature is not None and temperature != 1.0:
            temperature = float(temperature)
            warpers.append(TemperatureLogitsWarper(temperature))
            sampling = True
        if top_k is not None and top_k != 0:
            warpers.append(TopKLogitsWarper(top_k=top_k))
            sampling = True
        if top_p is not None and top_p < 1.0:
            warpers.append(TopPLogitsWarper(top_p=top_p))
            sampling = True

        self.warpers = warpers
        self.choice = Sampling() if sampling else Greedy()

    def __call__(self, input_ids, scores):
        scores = self.warpers(input_ids, scores)
        next_ids = self.choice(scores)
        return next_ids.unsqueeze(-1)


class StoppingCriteria:
    def __init__(self, max_new_tokens=20):
        self.max_new_tokens = max_new_tokens
        self.current_tokens = 0

    def __call__(self, all_ids):
        self.current_tokens += 1
        if self.current_tokens >= self.max_new_tokens:
            return True
        return False


def initialize_torch_distributed():
    rank = int(os.getenv("RANK", "0"))
    world_size = int(os.getenv("WORLD_SIZE", "1"))

    if torch.cuda.is_available():
        # initialize `torch.distributed`
        # Set the device id.
        assert world_size <= torch.cuda.device_count(), "Each process is one gpu"
        device = rank % torch.cuda.device_count()
        torch.cuda.set_device(device)
        backend = "nccl"
    else:
        backend = "gloo"

    # Call the init process.
    torch.distributed.init_process_group(
        backend=backend,
        world_size=world_size,
        rank=rank,
        init_method="tcp://localhost:6000",
    )

    return torch.distributed.distributed_c10d._get_default_group(), rank, world_size


@contextlib.contextmanager
def set_default_dtype(dtype):
    saved_dtype = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    try:
        yield
    finally:
        torch.set_default_dtype(saved_dtype)
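For context, a minimal sketch of how NextTokenChooser and StoppingCriteria compose in a decode loop; this is not part of the commit, and `model` and `input_ids` are hypothetical stand-ins for a causal LM and a prompt tensor:

# Usage sketch (not in this commit); `model` and `input_ids` are
# hypothetical stand-ins for a causal LM and a [batch, seq] prompt tensor.
chooser = NextTokenChooser(temperature=0.7, top_k=50, do_sample=True)
stopping_criteria = StoppingCriteria(max_new_tokens=20)

while True:
    logits = model(input_ids).logits[:, -1, :]  # scores for the last position
    next_ids = chooser(input_ids, logits)       # warp logits, then sample; shape [batch, 1]
    input_ids = torch.cat([input_ids, next_ids], dim=-1)
    if stopping_criteria(input_ids):            # True once max_new_tokens were generated
        break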
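Likewise, a rough sketch of how initialize_torch_distributed and set_default_dtype might be combined when loading a sharded model; the launcher-provided RANK/WORLD_SIZE variables and the `load_model` helper are illustrative assumptions, not part of this commit:

# Illustrative only; assumes RANK and WORLD_SIZE were exported by a launcher
# such as torchrun, and that load_model is a hypothetical sharded loader.
process_group, rank, world_size = initialize_torch_distributed()

# New tensors default to float16 inside the block; the previous default
# dtype is restored on exit, even if loading raises.
with set_default_dtype(torch.float16):
    model = load_model(process_group, rank, world_size)  # hypothetical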
server/poetry.lock (new file, mode 100644)

This diff is collapsed.
server/pyproject.toml (new file, mode 100644)
[tool.poetry]
name = "bloom-inference"
version = "0.1.0"
description = "BLOOM Inference Python gRPC Server"
authors = ["Olivier Dehaene <olivier@huggingface.co>"]

[tool.poetry.dependencies]
python = "^3.9"
protobuf = "^4.21.7"
grpcio = "^1.49.1"
torch = "^1.12.1"
typer = "^0.6.1"
grpcio-reflection = "^1.49.1"
accelerate = "^0.12.0"

[tool.poetry.group.dev.dependencies]
grpcio-tools = "^1.49.1"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"