OpenDAS / Megatron-LM · Commits

Commit 107c29e8, authored Sep 22, 2021 by mshoeybi

    working

Parent: 2f08c0c3

Showing 2 changed files with 87 additions and 2 deletions (+87, -2):
megatron/inference/communication.py  (+49, -0)
megatron/inference/tokenization.py   (+38, -2)
megatron/inference/communication.py (new file, mode 0 → 100644)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Communications utilities."""
import torch


def broadcast_tensor(size, dtype, tensor=None, rank=0):
    """Given size and type of a tensor on all ranks and the tensor value
    only on a specific rank, broadcast from that rank to all other ranks.
    """
    if torch.distributed.get_rank() == rank:
        assert tensor is not None
        assert tensor.is_cuda
    else:
        tensor = torch.empty(size,
                             dtype=dtype,
                             device=torch.cuda.current_device())

    torch.distributed.broadcast(tensor, rank)

    return tensor
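As a usage note (not part of this commit): broadcast_tensor is a collective, so every rank must call it with the same size, dtype, and rank arguments, and only the source rank supplies the tensor. A minimal driver sketch, assuming a NCCL process group initialized externally (e.g. via torchrun) with one GPU per rank:

# Hypothetical driver, for illustration only -- launch with e.g.
#   torchrun --nproc_per_node=2 demo.py
import torch

from megatron.inference.communication import broadcast_tensor

torch.distributed.init_process_group(backend='nccl')
torch.cuda.set_device(torch.distributed.get_rank() % torch.cuda.device_count())

# Only the source rank owns the data; all ranks agree on size and dtype.
tensor = None
if torch.distributed.get_rank() == 0:
    tensor = torch.arange(8, dtype=torch.int64,
                          device=torch.cuda.current_device())

# Collective call: after it returns, every rank holds the same 8 values.
tensor = broadcast_tensor([8], torch.int64, tensor=tensor, rank=0)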
def broadcast_int_list(size, int_list=None, rank=0):
    """Broadcast a list of integer values."""

    long_tensor = None
    if torch.distributed.get_rank() == rank:
        long_tensor = torch.tensor(int_list, dtype=torch.int64,
                                   device=torch.cuda.current_device())

    return broadcast_tensor(size, torch.int64, tensor=long_tensor, rank=rank)
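The reason broadcast_int_list exists becomes clear in tokenize_prompts below: non-source ranks cannot allocate a receive buffer for a tensor whose shape they do not know, so the shape is broadcast first as a small int64 tensor of known length. A sketch of that two-step pattern (variable names hypothetical, same process-group assumptions as above):

# Illustrative pattern, not from the commit: rank 0 holds a tensor
# whose shape the other ranks do not know in advance.
sizes_list = None
data = None
if torch.distributed.get_rank() == 0:
    data = torch.randn(4, 16, device=torch.cuda.current_device())
    sizes_list = [data.size(0), data.size(1)]

# Step 1: broadcast the shape (its length, 2, is known everywhere).
sizes = broadcast_int_list(2, int_list=sizes_list, rank=0).tolist()
# Step 2: broadcast the payload; non-source ranks can now allocate it.
data = broadcast_tensor(sizes, torch.float32, tensor=data, rank=0)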
megatron/inference/batching.py → megatron/inference/tokenization.py
...
@@ -13,16 +13,52 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Batching utilities."""
+"""Tokenization utilities."""

import torch

from megatron import get_tokenizer
+from .communication import broadcast_int_list, broadcast_tensor


-def tokenize_prompts_and_batch(prompts, tokens_to_generate):
+def tokenize_prompts(prompts=None, tokens_to_generate=None, rank=0):
"""Tokenize prompts and make them avaiable on all ranks."""
# On all ranks set to None so we can pass them to functions
sizes_list
=
None
prompts_tokens_cuda_long_tensor
=
None
prompts_length_cuda_long_tensor
=
None
# On the specified rank, build the above.
if
torch
.
distributed
.
get_rank
()
==
rank
:
assert
prompts
is
not
None
assert
tokens_to_generate
is
not
None
# Tensor of tokens padded and their unpadded length.
prompts_tokens_cuda_long_tensor
,
prompts_length_cuda_long_tensor
=
\
_tokenize_prompts_and_batch
(
prompts
,
tokens_to_generate
)
# We need the sizes of these tensors for the boradcast
sizes_list
=
[
prompts_tokens_cuda_long_tensor
.
size
(
0
),
# Batch size
prompts_tokens_cuda_long_tensor
.
size
(
1
)]
# Sequence lenght
# First, broadcast the sizes.
sizes_tensor
=
broadcast_int_list
(
2
,
int_list
=
sizes_list
,
rank
=
rank
)
# Now that we have the sizes, we can boradcast the tokens
# and length tensors.
sizes
=
sizes_tensor
.
tolist
()
prompts_tokens_cuda_long_tensor
=
broadcast_tensor
(
sizes
,
torch
.
int64
,
tensor
=
prompts_tokens_cuda_long_tensor
,
rank
=
rank
)
prompts_length_cuda_long_tensor
=
broadcast_tensor
(
sizes
[
0
],
torch
.
int64
,
tensor
=
prompts_length_cuda_long_tensor
,
rank
=
rank
)
return
prompts_tokens_cuda_long_tensor
,
prompts_length_cuda_long_tensor
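A hedged call-site sketch (not part of this commit): since tokenize_prompts broadcasts internally, every rank must enter it, and only the source rank passes the actual prompts. Assuming the same initialized process group as above:

# Hypothetical call site. Non-source ranks pass None and receive the
# padded token tensor and the unpadded-length tensor via broadcast.
if torch.distributed.get_rank() == 0:
    tokens, lengths = tokenize_prompts(
        prompts=['Hello world', 'Megatron-LM is'],
        tokens_to_generate=32, rank=0)
else:
    tokens, lengths = tokenize_prompts(rank=0)
# tokens is [batch, seq_len] int64 on GPU; lengths holds each prompt's
# unpadded length, as described in the function's comments.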
+def _tokenize_prompts_and_batch(prompts, tokens_to_generate):
    """Given a set of prompts and number of tokens to generate:
        - tokenize prompts
        - set the sequence length to be the max of length of prompts
...
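The body of _tokenize_prompts_and_batch is truncated in this diff view (the trailing '...'). Purely as an illustration of what its docstring describes, and emphatically not the commit's actual code, a padding routine could look like the following sketch (helper name suffixed _sketch to mark it hypothetical; it assumes Megatron's tokenizer API with tokenize() and an eod token id):

def _tokenize_prompts_and_batch_sketch(prompts, tokens_to_generate):
    # Illustrative only: the real body is elided in the diff above.
    tokenizer = get_tokenizer()
    prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts]
    prompts_length = [len(tokens) for tokens in prompts_tokens]
    # Pad every prompt out to the longest prompt plus generation room.
    max_len = max(prompts_length) + tokens_to_generate
    for tokens in prompts_tokens:
        tokens.extend([tokenizer.eod] * (max_len - len(tokens)))
    device = torch.cuda.current_device()
    return (torch.tensor(prompts_tokens, dtype=torch.int64, device=device),
            torch.tensor(prompts_length, dtype=torch.int64, device=device))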