Support tensor parallel (#2)

2f49f155 · Zhuohan Li · GitHub · cfae35b8 · 2f49f155 · 2f49f155
Unverified Commit 2f49f155 authored Mar 22, 2023 by Zhuohan Li Committed by GitHub Mar 21, 2023
20 changed files
--- a/README.md
+++ b/README.md
@@ -11,5 +11,6 @@ pip install -e .
 ## Run

 ```bash
-python server.py
+ray start --head
+python server.py [--tensor-parallel-size <N>]
 ```
--- a/cacheflow/models/__init__.py
+++ b/cacheflow/models/__init__.py
 from cacheflow.models.input_metadata import InputMetadata
 from cacheflow.models.model_utils import get_memory_analyzer
 from cacheflow.models.model_utils import get_model
-from cacheflow.models.utils import set_seed


 __all__ = [
    'InputMetadata',
    'get_memory_analyzer',
    'get_model',
-    'set_seed',
 ]
--- a/cacheflow/models/attention.py
+++ b/cacheflow/models/attention.py
@@ -112,7 +112,7 @@ class OPTCacheFlowAttention(nn.Module):
                output[:num_prompt_tokens],
                query[:num_prompt_tokens],
                key[:num_prompt_tokens],
-                value[:num_prompt_tokens],               
+                value[:num_prompt_tokens],
                input_metadata.prompt_lens,
            )


--- a/cacheflow/models/input_metadata.py
+++ b/cacheflow/models/input_metadata.py
@@ -43,4 +43,8 @@ class InputMetadata:
                f'num_generation_tokens={self.num_generation_tokens}, '
                f'num_valid_tokens={self.num_valid_tokens}, '
                f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, '
-                f'max_context_len={self.max_context_len})')
+                f'max_context_len={self.max_context_len}), '
+                f'prompt_lens={self.prompt_lens}, '
+                f'slot_mapping={self.slot_mapping}, '
+                f'context_lens={self.context_lens}, '
+                f'block_tables={self.block_tables})')
--- a/cacheflow/models/memory_analyzer.py
+++ b/cacheflow/models/memory_analyzer.py
@@ -31,12 +31,13 @@ class OPTMemoryAnalyzer(CacheFlowMemoryAnalyzer):
        model_name: str,
        block_size: int,
        dtype: torch.dtype,
+        tensor_parallel_size: int,
    ) -> None:
        self.model_name = model_name
        self.block_size = block_size
        self.dtype = dtype
+        self.tensor_parallel_size = tensor_parallel_size

-        # TODO(woosuk): Support tensor parallelism.
        config = AutoConfig.from_pretrained(model_name)
        self.num_layers = config.num_hidden_layers
        self.hidden_size = config.hidden_size
@@ -48,26 +49,25 @@ class OPTMemoryAnalyzer(CacheFlowMemoryAnalyzer):
        self.max_position = config.max_position_embeddings

    def _get_param_size(self) -> int:
-        # TODO(woosuk): Support tensor parallelism.
-        word_embedding = self.vocab_size * self.embedding_size
+        word_embedding = self.vocab_size * self.embedding_size // self.tensor_parallel_size
        if self.embedding_size != self.vocab_size:
            # Project in/out.
            word_embedding += 2 * self.embedding_size * self.vocab_size
        position_embedding = self.max_position * self.hidden_size

        ln1 = 2 * self.hidden_size
-        q = self.hidden_size * self.hidden_size + self.hidden_size
-        k = self.hidden_size * self.hidden_size + self.hidden_size
-        v = self.hidden_size * self.hidden_size + self.hidden_size
-        out = self.hidden_size * self.hidden_size + self.hidden_size
+        q = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
+        k = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
+        v = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
+        out = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
        mha = ln1 + q + k + v + out

        ln2 = 2 * self.hidden_size
-        ffn1 = self.hidden_size * self.ffn_size + self.ffn_size
-        ffn2 = self.ffn_size * self.hidden_size + self.hidden_size
+        ffn1 = self.hidden_size * self.ffn_size // self.tensor_parallel_size + self.ffn_size
+        ffn2 = self.ffn_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
        ffn = ln2 + ffn1 + ffn2

-        total = (word_embedding + position_embedding + 
+        total = (word_embedding + position_embedding +
                 self.num_layers * (mha + ffn))
        dtype_size = get_dtype_size(self.dtype)
        return dtype_size * total
@@ -76,15 +76,17 @@ class OPTMemoryAnalyzer(CacheFlowMemoryAnalyzer):
        self,
        max_num_batched_tokens: int,
    ) -> int:
-        # TODO(woosuk): Support tensor parallelism.
        # NOTE: We approxmiately calculate the maximum activation size by
-        # 1) estimating the maximum activation tensor size during inference, and
-        # 2) multiplying it by 4.
+        # estimating
+        # 1) the maximum activation tensor size during inference
+        # 2) the residual tensor size during inference
        # Here, we assume that FlashAttention is used and
        # thus the attention maps are never materialized in GPU DRAM.
-        qkv = 3 * (max_num_batched_tokens * self.hidden_size)
-        ffn = max_num_batched_tokens * self.ffn_size
-        max_act = 4 * max(qkv, ffn)
+        residual = max_num_batched_tokens * self.hidden_size
+        qkv = 3 * (max_num_batched_tokens * self.hidden_size) // self.tensor_parallel_size
+        ffn = max_num_batched_tokens * self.ffn_size // self.tensor_parallel_size
+        # Double the activation size for input and output.
+        max_act = 2 * (max(qkv, ffn) + residual)
        dtype_size = get_dtype_size(self.dtype)
        return dtype_size * max_act


--- a/cacheflow/models/model_utils.py
+++ b/cacheflow/models/model_utils.py
 from typing import Union

+import numpy as np
 import torch
 import torch.nn as nn
+from transformers import AutoConfig

 from cacheflow.models.memory_analyzer import CacheFlowMemoryAnalyzer
 from cacheflow.models.memory_analyzer import OPTMemoryAnalyzer
@@ -21,13 +23,20 @@ _MEMORY_ANALYZERS = {
 def get_model(
    model_name: str,
    dtype: Union[torch.dtype, str],
+    path: str,
 ) -> nn.Module:
    torch_dtype = get_torch_dtype(dtype)
-    for model_class, hf_model in _MODELS.items():
-        if model_class in model_name:
-            model = hf_model.from_pretrained(
-                model_name, torch_dtype=torch_dtype)
-            return model.eval()
+    torch.set_default_dtype(torch_dtype)
+    config = AutoConfig.from_pretrained(model_name)
+    for model_class_name, model_class in _MODELS.items():
+        if model_class_name in model_name:
+            # Download model weights if it's not cached.
+            weights_dir = model_class.download_weights(model_name, path=path)
+            # Create a model instance.
+            model = model_class(config)
+            # Load the weights from the cached or downloaded files.
+            model.load_weights(weights_dir)
+            return model.eval(), torch_dtype
    raise ValueError(f'Unsupported model name: {model_name}')


@@ -35,10 +44,11 @@ def get_memory_analyzer(
    model_name: str,
    block_size: int,
    dtype: Union[torch.dtype, str],
+    tensor_parallel_size: int = 1,
 ) -> CacheFlowMemoryAnalyzer:
    torch_dtype = get_torch_dtype(dtype)
    for model_class, memory_analyzer in _MEMORY_ANALYZERS.items():
        if model_class in model_name:
            return memory_analyzer(
-                model_name, block_size, torch_dtype)
+                model_name, block_size, torch_dtype, tensor_parallel_size)
    raise ValueError(f'Unsupported model name: {model_name}')
--- a/cacheflow/models/opt.py
+++ b/cacheflow/models/opt.py
 """1D OPT model compatible with HuggingFace weights."""
+import os
+import glob
+import filelock
+from tqdm import tqdm
 from typing import Dict, List, Optional, Tuple

+import numpy as np
 import torch
 from torch import nn
 from transformers import OPTConfig
-from transformers import PreTrainedModel
+from huggingface_hub import snapshot_download

 from cacheflow.models import InputMetadata
 from cacheflow.models.attention import OPTCacheFlowAttention
 from cacheflow.models.sample import Sampler
+from cacheflow.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from cacheflow.parallel_utils.tensor_parallel import (VocabParallelEmbedding,
+                                                      ColumnParallelLinear,
+                                                      RowParallelLinear)
 from cacheflow.sequence import SequenceOutputs

 KVCache = Tuple[torch.Tensor, torch.Tensor]
@@ -36,15 +46,26 @@ class OPTAttention(nn.Module):
    ) -> None:
        super().__init__()
        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.head_dim = embed_dim // num_heads
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        total_num_heads = num_heads
+        assert num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = embed_dim // total_num_heads
        self.scaling = self.head_dim**-0.5

        # TODO(woosuk): Fuse the three linear layers into one QKV linear layer.
-        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.k_proj = ColumnParallelLinear(embed_dim, embed_dim, bias=bias,
+                                           gather_output=False,
+                                           perform_initialization=False)
+        self.v_proj = ColumnParallelLinear(embed_dim, embed_dim, bias=bias,
+                                           gather_output=False,
+                                           perform_initialization=False)
+        self.q_proj = ColumnParallelLinear(embed_dim, embed_dim, bias=bias,
+                                           gather_output=False,
+                                           perform_initialization=False)
+        self.out_proj = RowParallelLinear(embed_dim, embed_dim, bias=bias,
+                                          input_is_parallel=True,
+                                          perform_initialization=False)

        self.attn = OPTCacheFlowAttention(scale=self.scaling)

@@ -55,13 +76,13 @@ class OPTAttention(nn.Module):
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
-        q = self.q_proj(hidden_states)
-        k = self.k_proj(hidden_states)
-        v = self.v_proj(hidden_states)
+        q, _ = self.q_proj(hidden_states)
+        k, _ = self.k_proj(hidden_states)
+        v, _ = self.v_proj(hidden_states)
        key_cache, value_cache = kv_cache
        attn_output = self.attn(
            q, k, v, key_cache, value_cache, input_metadata, cache_event)
-        output = self.out_proj(attn_output)
+        output, _ = self.out_proj(attn_output)
        return output


@@ -69,6 +90,7 @@ class OPTDecoderLayer(nn.Module):

    def __init__(self, config: OPTConfig):
        super().__init__()
+        self.config = config
        self.embed_dim = config.hidden_size
        self.self_attn = OPTAttention(
            embed_dim=self.embed_dim,
@@ -81,9 +103,16 @@ class OPTDecoderLayer(nn.Module):

        self.self_attn_layer_norm = nn.LayerNorm(
            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)
-        self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=config.enable_bias)
-        self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=config.enable_bias)
-        self.final_layer_norm = nn.LayerNorm(self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)
+        self.fc1 = ColumnParallelLinear(self.embed_dim, config.ffn_dim,
+                                        bias=config.enable_bias,
+                                        gather_output=False,
+                                        perform_initialization=False)
+        self.fc2 = RowParallelLinear(config.ffn_dim, self.embed_dim,
+                                     bias=config.enable_bias,
+                                     input_is_parallel=True,
+                                     perform_initialization=False)
+        self.final_layer_norm = nn.LayerNorm(
+            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)

    def forward(
        self,
@@ -112,9 +141,9 @@ class OPTDecoderLayer(nn.Module):
        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
        if self.do_layer_norm_before:
            hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states = self.fc1(hidden_states)
+        hidden_states, _ = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
-        hidden_states = self.fc2(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
        hidden_states = residual + hidden_states
        # 350m applies layer norm AFTER attention
        if not self.do_layer_norm_before:
@@ -122,29 +151,23 @@ class OPTDecoderLayer(nn.Module):
        return hidden_states


-class OPTPreTrainedModel(PreTrainedModel):
-    config_class = OPTConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["OPTDecoderLayer"]
-    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
-
-    def _init_weights(self, module) -> None:
-        del module  # unused
-        return
-
-
-class OPTDecoder(OPTPreTrainedModel):
+class OPTDecoder(nn.Module):

    def __init__(self, config: OPTConfig):
-        super().__init__(config)
+        super().__init__()
+        self.config = config
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.vocab_size = config.vocab_size

-        self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx)
-        self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)
+        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
+                                                   config.word_embed_proj_dim,
+                                                   perform_initialization=False)
+        # Positional embeddings are replicated (not sharded).
+        self.embed_positions = OPTLearnedPositionalEmbedding(
+            config.max_position_embeddings, config.hidden_size)

+        # Project out & in will be replicated if they exist.
        if config.word_embed_proj_dim != config.hidden_size:
            self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
        else:
@@ -167,9 +190,6 @@ class OPTDecoder(OPTPreTrainedModel):

        self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])

-        # Initialize weights and apply final processing
-        self.post_init()
-
    def forward(
        self,
        input_ids: torch.LongTensor,
@@ -200,13 +220,11 @@ class OPTDecoder(OPTPreTrainedModel):
        return hidden_states


-class OPTModel(OPTPreTrainedModel):
+class OPTModel(nn.Module):

    def __init__(self, config: OPTConfig):
-        super().__init__(config)
+        super().__init__()
        self.decoder = OPTDecoder(config)
-        # Initialize weights and apply final processing
-        self.post_init()

    def forward(
        self,
@@ -220,41 +238,17 @@ class OPTModel(OPTPreTrainedModel):
            input_ids, positions, kv_caches, input_metadata, cache_events)


-class OPTForCausalLM(OPTPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
+class OPTForCausalLM(nn.Module):

    def __init__(self, config):
-        super().__init__(config)
+        super().__init__()
+        self.config = config
        self.model = OPTModel(config)
-        # the lm_head weight is automatically tied to the embed tokens weight
-        self.lm_head = nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False)
+        # TODO(zhuohan): create a new weight after implementing pipeline
+        #                parallelism
+        self.lm_head_weight = self.model.decoder.embed_tokens.weight
        self.sampler = Sampler()

-        # Initialize weights and apply final processing
-        self.post_init()
-
-    # NOTE(woosuk): While the following methods are not called in the model code,
-    # they may be internally used by the transformers library.
-    # For example, tie_weights() does not work without these methods.
-    # Thus, do not delete these methods.
-    def get_input_embeddings(self):
-        return self.model.decoder.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.model.decoder.embed_tokens = value
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-
-    def set_decoder(self, decoder):
-        self.model.decoder = decoder
-
-    def get_decoder(self):
-        return self.model.decoder
-
    def forward(
        self,
        input_ids: torch.LongTensor,
@@ -266,5 +260,72 @@ class OPTForCausalLM(OPTPreTrainedModel):
        hidden_states = self.model(
            input_ids, positions, kv_caches, input_metadata, cache_events)
        next_tokens = self.sampler(
-            self.lm_head.weight, hidden_states, input_metadata)
+            self.lm_head_weight, hidden_states, input_metadata)
        return next_tokens
+
+    _column_parallel_weights = ["embed_tokens.weight",
+                                "q_proj.weight", "k_proj.weight",
+                                "v_proj.weight", "fc1.weight"]
+    _column_parallel_biases = ["q_proj.bias", "k_proj.bias",
+                                "v_proj.bias", "fc1.bias"]
+    _row_parallel_weights = ["out_proj.weight", "fc2.weight"]
+
+    def load_weights(self, weights_path: str):
+        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
+        state_dict = self.state_dict()
+        for name, param in state_dict.items():
+            if "lm_head_weight" in name:
+                continue
+            loaded_weight = torch.from_numpy(np.load(os.path.join(weights_path,
+                                                                  name)))
+            for p in (self._column_parallel_weights
+                      + self._column_parallel_biases):
+                if p in name:
+                    shard_size = param.shape[0]
+                    loaded_weight = loaded_weight[
+                        shard_size * tensor_model_parallel_rank
+                        :shard_size * (tensor_model_parallel_rank + 1)]
+                    break
+            for p in self._row_parallel_weights:
+                if p in name:
+                    shard_size = param.shape[1]
+                    loaded_weight = loaded_weight[
+                        :,
+                        shard_size * tensor_model_parallel_rank
+                        :shard_size * (tensor_model_parallel_rank + 1)]
+                    break
+
+            assert param.shape == loaded_weight.shape
+            param.data.copy_(loaded_weight)
+
+    @staticmethod
+    def download_weights(model_name: str, path: str):
+        path = os.path.join(path, f"{model_name}-np")
+        path = os.path.abspath(os.path.expanduser(path))
+        os.makedirs(path, exist_ok=True)
+        lock_path = os.path.join(path, "file_lock")
+        lock = filelock.FileLock(lock_path)
+
+        with lock:
+            test_weight_path = os.path.join(
+                path, "model.decoder.embed_positions.weight")
+            if os.path.exists(test_weight_path):
+                return path
+
+            folder = snapshot_download(model_name, allow_patterns="*.bin",
+                                       cache_dir=os.path.join(path, "cache"))
+            bin_files = glob.glob(os.path.join(folder, "*.bin"))
+
+            if "/" in model_name:
+                model_name = model_name.split("/")[1].lower()
+
+            for bin_file in tqdm(bin_files, desc="Convert format"):
+                state = torch.load(bin_file)
+                for name, param in tqdm(state.items(), leave=False):
+                    if name.startswith("decoder."):
+                        name = "model." + name
+                    param_path = os.path.join(path, name)
+                    with open(param_path, "wb") as f:
+                        np.save(f, param.cpu().detach().numpy())
+
+            return path
--- a/cacheflow/models/sample.py
+++ b/cacheflow/models/sample.py
@@ -6,7 +6,7 @@ import torch.nn as nn
 from cacheflow.models import InputMetadata
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import SequenceOutputs
-
+from cacheflow.parallel_utils.tensor_parallel import gather_from_tensor_model_parallel_region

 class Sampler(nn.Module):

@@ -24,6 +24,7 @@ class Sampler(nn.Module):

        # Get the logits for the next tokens.
        logits = torch.matmul(hidden_states, embedding.t())
+        logits = gather_from_tensor_model_parallel_region(logits)

        # Apply temperature scaling.
        temperatures = _get_temperatures(input_metadata)

--- a/cacheflow/models/utils.py
+++ b/cacheflow/models/utils.py
@@ -27,14 +27,6 @@ def get_dtype_size(dtype: Union[torch.dtype, str]) -> int:
    return torch.tensor([], dtype=torch_dtype).element_size()


-def set_seed(seed: int) -> None:
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-
-
 def get_gpu_memory(gpu: int = 0) -> int:
    return torch.cuda.get_device_properties(gpu).total_memory


--- a/cacheflow/parallel_utils/README.md
+++ b/cacheflow/parallel_utils/README.md
+The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the codes that are used in inference.
\ No newline at end of file
--- a/cacheflow/parallel_utils/__init__.py
+++ b/cacheflow/parallel_utils/__init__.py
+import cacheflow.parallel_utils.parallel_state
+import cacheflow.parallel_utils.tensor_parallel
+import cacheflow.parallel_utils.utils
+
+# Alias parallel_state as mpu, its legacy name
+mpu = parallel_state
+
+__all__ = [
+    "parallel_state",
+    "tensor_parallel",
+    "utils",
+]
--- a/cacheflow/parallel_utils/parallel_state.py
+++ b/cacheflow/parallel_utils/parallel_state.py
--- a/cacheflow/parallel_utils/tensor_parallel/__init__.py
+++ b/cacheflow/parallel_utils/tensor_parallel/__init__.py
+from .layers import (
+    ColumnParallelLinear,
+    RowParallelLinear,
+    VocabParallelEmbedding,
+    set_tensor_model_parallel_attributes,
+    set_defaults_if_not_set_tensor_model_parallel_attributes,
+    copy_tensor_model_parallel_attributes,
+    param_is_not_tensor_parallel_duplicate,
+    linear_with_grad_accumulation_and_async_allreduce
+
+)
+
+from .mappings import (
+    copy_to_tensor_model_parallel_region,
+    gather_from_tensor_model_parallel_region,
+    gather_from_sequence_parallel_region,
+    scatter_to_tensor_model_parallel_region,
+    scatter_to_sequence_parallel_region,
+)
+
+from .random import (
+    checkpoint,
+    get_cuda_rng_tracker,
+    model_parallel_cuda_manual_seed,
+)
+
+from .utils import (
+    split_tensor_along_last_dim,
+    split_tensor_into_1d_equal_chunks,
+    gather_split_1d_tensor,
+)
+
+__all__ = [
+    #layers.py
+    "ColumnParallelLinear",
+    "RowParallelLinear",
+    "VocabParallelEmbedding",
+    "set_tensor_model_parallel_attributes",
+    "set_defaults_if_not_set_tensor_model_parallel_attributes",
+    "copy_tensor_model_parallel_attributes",
+    "param_is_not_tensor_parallel_duplicate",
+    "linear_with_grad_accumulation_and_async_allreduce",
+    # mappings.py
+    "copy_to_tensor_model_parallel_region",
+    "gather_from_tensor_model_parallel_region",
+    "gather_from_sequence_parallel_region",
+#    "reduce_from_tensor_model_parallel_region",
+    "scatter_to_tensor_model_parallel_region",
+    "scatter_to_sequence_parallel_region",
+    # random.py
+    "checkpoint",
+    "get_cuda_rng_tracker",
+    "model_parallel_cuda_manual_seed",
+    # utils.py
+    "split_tensor_along_last_dim",
+    "split_tensor_into_1d_equal_chunks",
+    "gather_split_1d_tensor",
+]
--- a/cacheflow/parallel_utils/tensor_parallel/layers.py
+++ b/cacheflow/parallel_utils/tensor_parallel/layers.py
--- a/cacheflow/parallel_utils/tensor_parallel/mappings.py
+++ b/cacheflow/parallel_utils/tensor_parallel/mappings.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+
+from cacheflow.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    get_tensor_model_parallel_group,
+)
+from .utils import split_tensor_along_last_dim
+
+
+def _reduce(input_):
+    """All-reduce the input tensor across model parallel group."""
+
+    # Bypass the function if we are using only 1 GPU.
+    if get_tensor_model_parallel_world_size()==1:
+        return input_
+
+    # All-reduce.
+    torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group())
+
+    return input_
+
+
+def _split_along_last_dim(input_):
+    """Split the tensor along its last dimension and keep the
+    corresponding slice."""
+
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+
+    # Split along last dimension.
+    input_list = split_tensor_along_last_dim(input_, world_size)
+
+    # Note: torch.split does not create contiguous tensors by default.
+    rank = get_tensor_model_parallel_rank()
+    output = input_list[rank].contiguous()
+
+    return output
+
+
+def _split_along_first_dim(input_):
+    """Split the tensor along its first dimension and keep the
+    corresponding slice."""
+
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+
+    # Split along first dimension.
+    dim_size = input_.size()[0]
+    assert dim_size % world_size == 0, \
+        "First dimension of the tensor should be divisible by tensor parallel size"
+    local_dim_size = dim_size // world_size
+    rank = get_tensor_model_parallel_rank()
+    dim_offset = rank * local_dim_size
+
+    output = input_[dim_offset:dim_offset+local_dim_size].contiguous()
+
+    return output
+
+
+def _gather_along_last_dim(input_):
+    """Gather tensors and concatinate along the last dimension."""
+
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+
+    # Size and dimension.
+    last_dim = input_.dim() - 1
+    rank = get_tensor_model_parallel_rank()
+
+    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
+    tensor_list[rank] = input_
+    torch.distributed.all_gather(tensor_list, input_, group=get_tensor_model_parallel_group())
+
+    # Note: torch.cat already creates a contiguous tensor.
+    output = torch.cat(tensor_list, dim=last_dim).contiguous()
+
+    return output
+
+
+def _gather_along_first_dim(input_):
+    """Gather tensors and concatinate along the first dimension."""
+
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+
+    dim_size = list(input_.size())
+    dim_size[0] = dim_size[0] * world_size
+
+    output = torch.empty(dim_size, dtype=input_.dtype,
+                         device=torch.cuda.current_device())
+    torch.distributed._all_gather_base(output, input_.contiguous(),
+                                       group=get_tensor_model_parallel_group())
+
+    return output
+
+def _reduce_scatter_along_first_dim(input_):
+    """Reduce-scatter the input tensor across model parallel group."""
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+
+    dim_size = list(input_.size())
+    assert dim_size[0] % world_size == 0, \
+        "First dimension of the tensor should be divisible by tensor parallel size"
+
+    dim_size[0] = dim_size[0] // world_size
+
+    output = torch.empty(dim_size, dtype=input_.dtype,
+                         device=torch.cuda.current_device())
+    torch.distributed._reduce_scatter_base(output, input_.contiguous(),
+                                           group=get_tensor_model_parallel_group())
+    return output
+
+
+class _CopyToModelParallelRegion(torch.autograd.Function):
+    """Pass the input to the model parallel region."""
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return input_
+
+    @staticmethod
+    def forward(ctx, input_):
+        return input_
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _reduce(grad_output)
+
+
+class _ReduceFromModelParallelRegion(torch.autograd.Function):
+    """All-reduce the input from the model parallel region."""
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _reduce(input_)
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _reduce(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output
+
+
+class _ScatterToModelParallelRegion(torch.autograd.Function):
+    """Split the input and keep only the corresponding chuck to the rank."""
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _split_along_last_dim(input_)
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _split_along_last_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _gather_along_last_dim(grad_output)
+
+
+class _GatherFromModelParallelRegion(torch.autograd.Function):
+    """Gather the input from model parallel region and concatinate."""
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _gather_along_last_dim(input_)
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _gather_along_last_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _split_along_last_dim(grad_output)
+
+
+class _ScatterToSequenceParallelRegion(torch.autograd.Function):
+    """Split the input and keep only the corresponding chuck to the rank."""
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _split_along_first_dim(input_)
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _split_along_first_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _gather_along_first_dim(grad_output)
+
+
+class _GatherFromSequenceParallelRegion(torch.autograd.Function):
+    """Gather the input from sequence parallel region and concatinate."""
+
+    @staticmethod
+    def symbolic(graph, input_, tensor_parallel_output_grad=True):
+        return _gather_along_first_dim(input_)
+
+    @staticmethod
+    def forward(ctx, input_, tensor_parallel_output_grad=True):
+        ctx.tensor_parallel_output_grad = tensor_parallel_output_grad
+        return _gather_along_first_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        tensor_parallel_output_grad = ctx.tensor_parallel_output_grad
+
+        # If the computation graph after the gather operation is
+        # in the tensor parallel mode, output gradients need to reduce
+        # scattered and whereas if the computation is duplicated,
+        # output gradients need to be scattered.
+        if tensor_parallel_output_grad:
+            return _reduce_scatter_along_first_dim(grad_output), None
+        else:
+            return _split_along_first_dim(grad_output), None
+
+
+class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
+    """Reduce scatter the input from the model parallel region."""
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _reduce_scatter_along_first_dim(input_)
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _reduce_scatter_along_first_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _gather_along_first_dim(grad_output)
+
+
+# -----------------
+# Helper functions.
+# -----------------
+
+def copy_to_tensor_model_parallel_region(input_):
+    return _CopyToModelParallelRegion.apply(input_)
+
+
+def reduce_from_tensor_model_parallel_region(input_):
+    return _ReduceFromModelParallelRegion.apply(input_)
+
+
+def scatter_to_tensor_model_parallel_region(input_):
+    return _ScatterToModelParallelRegion.apply(input_)
+
+
+def gather_from_tensor_model_parallel_region(input_):
+    return _GatherFromModelParallelRegion.apply(input_)
+
+
+def scatter_to_sequence_parallel_region(input_):
+    return _ScatterToSequenceParallelRegion.apply(input_)
+
+
+def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True):
+    return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad)
+
+
+def reduce_scatter_to_sequence_parallel_region(input_):
+    return _ReduceScatterToSequenceParallelRegion.apply(input_)
+
--- a/cacheflow/parallel_utils/tensor_parallel/random.py
+++ b/cacheflow/parallel_utils/tensor_parallel/random.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+# Parts of the code here are adapted from PyTorch
+# repo: https://github.com/pytorch/pytorch
+
+import contextlib
+
+import torch
+from torch import _C
+from torch.cuda import _lazy_call, device as device_ctx_manager
+from torch.utils.checkpoint import detach_variable
+
+from cacheflow.parallel_utils.parallel_state import (
+    get_data_parallel_rank,
+    get_tensor_model_parallel_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+
+from .utils import (
+    split_tensor_into_1d_equal_chunks,
+    gather_split_1d_tensor,
+)
+
+from cacheflow.parallel_utils.utils import safely_set_viewless_tensor_data
+
+# Default name for the model parallel rng tracker.
+_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
+
+
+def _set_cuda_rng_state(new_state, device=-1):
+    """Sets the random number generator state of the current GPU.
+
+    Argumentss:
+        new_state (torch.ByteTensor): The desired state
+    This function is adapted from PyTorch repo (torch.cuda.set_rng_state)
+    with a single change: the input state is not cloned. Cloning caused
+    major performance issues for +4 GPU cases.
+    """
+    if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
+        # older PyTorch
+        def cb():
+            with device_ctx_manager(device):
+                _C._cuda_setRNGState(new_state)
+    else:
+        # newer PyTorch
+        if device == -1:
+            device = torch.device('cuda')
+        elif isinstance(device, str):
+            device = torch.device(device)
+        elif isinstance(device, int):
+            device = torch.device('cuda', device)
+
+        def cb():
+            idx = device.index
+            if idx is None:
+                idx = torch.cuda.current_device()
+            default_generator = torch.cuda.default_generators[idx]
+            default_generator.set_state(new_state)
+
+    _lazy_call(cb)
+
+
+
+class CudaRNGStatesTracker:
+    """Tracker for the cuda RNG states.
+
+    Using the `add` method, a cuda rng state is initialized based on
+    the input `seed` and is assigned to `name`. Later, by forking the
+    rng state, we can perform operations and return to our starting
+    cuda state.
+    """
+
+    def __init__(self):
+        # Map from a string name to the cuda rng state.
+        self.states_ = {}
+        # Seeds are just for book keeping and ensure no seed is set twice.
+        self.seeds_ = set()
+
+    def reset(self):
+        """Set to the initial state (no tracker)."""
+        self.states_ = {}
+        self.seeds_ = set()
+
+    def get_states(self):
+        """Get rng states. Copy the dictionary so we have direct
+        pointers to the states, not just a pointer to the dictionary."""
+        states = {}
+        for name in self.states_:
+            states[name] = self.states_[name]
+        return states
+
+    def set_states(self, states):
+        """Set the rng states. For efficiency purposes, we do not check
+        the size of seed for compatibility."""
+        self.states_ = states
+
+    def add(self, name, seed):
+        """Track the rng state."""
+        # Check seed is not already used.
+        if seed in self.seeds_:
+            raise Exception('seed {} already exists'.format(seed))
+        self.seeds_.add(seed)
+        # Check that state is not already defined.
+        if name in self.states_:
+            raise Exception('cuda rng state {} already exists'.format(name))
+        # Get the current rng state.
+        orig_rng_state = torch.cuda.get_rng_state()
+        # Set the new state and store it.
+        torch.cuda.manual_seed(seed)
+        self.states_[name] = torch.cuda.get_rng_state()
+        # Reset rng state to what it was.
+        _set_cuda_rng_state(orig_rng_state)
+
+    @contextlib.contextmanager
+    def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
+        """Fork the cuda rng state, perform operations, and exit with
+        the original state."""
+        # Check if we have added the state
+        if name not in self.states_:
+            raise Exception('cuda rng state {} is not added'.format(name))
+        # Store current rng state.
+        orig_cuda_rng_state = torch.cuda.get_rng_state()
+        # Set rng state to the desired one
+        _set_cuda_rng_state(self.states_[name])
+        # Do the stuff we wanted to do.
+        try:
+            yield
+        finally:
+            # Update the current rng state for later use.
+            self.states_[name] = torch.cuda.get_rng_state()
+            # And set the state to the original state we started with.
+            _set_cuda_rng_state(orig_cuda_rng_state)
+
+
+# RNG tracker object.
+_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
+
+
+def get_cuda_rng_tracker():
+    """Get cuda rng tracker."""
+    return _CUDA_RNG_STATE_TRACKER
+
+
+def model_parallel_cuda_manual_seed(seed):
+    """Initialize model parallel cuda seed.
+
+    This function should be called after the model parallel is
+    initialized. Also, no torch.cuda.manual_seed should be called
+    after this function. Basically, this is replacement for that
+    function.
+    Two set of RNG states are tracked:
+        default state: This is for data parallelism and is the same among a
+                       set of model parallel GPUs but different across
+                       different model paralle groups. This is used for
+                       example for dropout in the non-tensor-model-parallel regions.
+        tensor-model-parallel state: This state is different among a set of model
+                              parallel GPUs, but the same across data parallel
+                              groups. This is used for example for dropout in
+                              model parallel regions.
+    """
+    # 2718 is just for fun and any POSITIVE value will work.
+    offset = seed + 2718
+    tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank()
+    # Data parallel gets the original seed.
+    data_parallel_seed = seed
+
+    _CUDA_RNG_STATE_TRACKER.reset()
+    # Set the default state.
+    torch.cuda.manual_seed(data_parallel_seed)
+    # and model parallel state.
+    _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME,
+                                tensor_model_parallel_seed)
+
+
+class CheckpointFunction(torch.autograd.Function):
+    """This function is adapted from torch.utils.checkpoint with
+       two main changes:
+           1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
+           2) the states in the model parallel tracker are also properly
+              tracked/set/reset.
+    """
+    @staticmethod
+    def forward(ctx, run_function, distribute_saved_activations, *args):
+        ctx.run_function = run_function
+        ctx.distribute_saved_activations \
+            = distribute_saved_activations
+
+        # Copy the rng states.
+        ctx.fwd_cpu_rng_state = torch.get_rng_state()
+        ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
+        ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+
+        with torch.no_grad():
+            outputs = run_function(*args)
+
+        # Divide hidden states across model parallel group and only keep
+        # the chunk corresponding to the current rank.
+        if distribute_saved_activations:
+            ctx.input_0_shape = args[0].data.shape
+            safely_set_viewless_tensor_data(
+                args[0],
+                split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True))
+
+        # Store everything.
+        ctx.save_for_backward(*args)
+
+        return outputs
+
+    @staticmethod
+    def backward(ctx, *args):
+        if not torch.autograd._is_checkpoint_valid():
+            raise RuntimeError("Checkpointing is not compatible with .grad(), "
+                               "please use .backward() if possible")
+        inputs = ctx.saved_tensors
+        if ctx.distribute_saved_activations:
+            safely_set_viewless_tensor_data(
+                inputs[0],
+                gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape))
+
+        # Store the current states.
+        bwd_cpu_rng_state = torch.get_rng_state()
+        bwd_cuda_rng_state = torch.cuda.get_rng_state()
+        bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+
+        # Set the states to what it used to be before the forward pass.
+        torch.set_rng_state(ctx.fwd_cpu_rng_state)
+        _set_cuda_rng_state(ctx.fwd_cuda_rng_state)
+        get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)
+
+        # Compute the forward pass.
+        detached_inputs = detach_variable(inputs)
+        with torch.enable_grad():
+            outputs = ctx.run_function(*detached_inputs)
+
+        # Set the states back to what it was at the start of this function.
+        torch.set_rng_state(bwd_cpu_rng_state)
+        _set_cuda_rng_state(bwd_cuda_rng_state)
+        get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
+
+        if isinstance(outputs, torch.Tensor):
+            outputs = (outputs,)
+        torch.autograd.backward(outputs, args)
+        grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp
+                      for inp in detached_inputs)
+        return (None, None) + grads
+
+
+def checkpoint(function, distribute_saved_activations, *args):
+    """Checkpoint a model or part of the model.
+    This has been directly copied from torch.utils.checkpoint."""
+    return CheckpointFunction.apply(function,
+                                    distribute_saved_activations, *args)
--- a/cacheflow/parallel_utils/tensor_parallel/utils.py
+++ b/cacheflow/parallel_utils/tensor_parallel/utils.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+from typing import List, Sequence
+
+from cacheflow.parallel_utils.utils import divide
+from cacheflow.parallel_utils import parallel_state
+
+def split_tensor_along_last_dim(
+    tensor: torch.Tensor,
+    num_partitions: int,
+    contiguous_split_chunks: bool = False,
+) -> List[torch.Tensor]:
+    """ Split a tensor along its last dimension.
+
+        Arguments:
+            tensor: input tensor.
+            num_partitions: number of partitions to split the tensor
+            contiguous_split_chunks: If True, make each chunk contiguous
+                                     in memory.
+
+        Returns:
+            A list of Tensors
+    """
+    # Get the size and dimension.
+    last_dim = tensor.dim() - 1
+    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
+    # Split.
+    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+    # Note: torch.split does not create contiguous tensors by default.
+    if contiguous_split_chunks:
+        return tuple(chunk.contiguous() for chunk in tensor_list)
+
+    return tensor_list
+
+def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
+    """ Break a tensor into equal 1D chunks across tensor parallel ranks.
+
+        Returns a Tensor or View with this rank's portion of the data.
+
+        Arguments:
+            tensor: The tensor to split
+
+        Keyword Arguments:
+            new_buffer (bool): If True, returns a new Tensor.
+                               If False, returns a view into the existing Tensor.
+                               Default is False
+
+    """
+    partition_size = torch.numel(tensor) // \
+        parallel_state.get_tensor_model_parallel_world_size()
+    start_index = partition_size * parallel_state.get_tensor_model_parallel_rank()
+    end_index = start_index + partition_size
+    if new_buffer:
+        data = torch.empty(partition_size, dtype=tensor.dtype,
+                           device=torch.cuda.current_device(),
+                           requires_grad=False)
+        data.copy_(tensor.view(-1)[start_index:end_index])
+    else:
+        data = tensor.view(-1)[start_index:end_index]
+    return data
+
+
+def gather_split_1d_tensor(tensor):
+    """ Opposite of split_tensor_into_1d_equal_chunks. Gather values from tensor
+        model parallel ranks.
+
+        Returns a new Tensor with the gathered data.
+
+        Arguments:
+            tensor: A Tensor or view of this rank's portion of the data.
+    """
+    numel_gathered = torch.numel(tensor) * \
+        parallel_state.get_tensor_model_parallel_world_size()
+    gathered = torch.empty(numel_gathered, dtype=tensor.dtype,
+                           device=torch.cuda.current_device(),
+                           requires_grad=False)
+    # TODO: This API is experimental in pytorch (as of Feb 2022) and
+    # this might break in future pytorch releases. We chose this API
+    # as opposed to torch.distributed.all_gather for efficiency reasons.
+    # This API calls directly NCCL all-gather versus the former does
+    # internal copies and can potentially cause slow down.
+    torch.distributed._all_gather_base(gathered, tensor,
+                                       group=parallel_state.get_tensor_model_parallel_group())
+    return gathered
+
+
+class VocabUtility:
+    """ Split the vocabulary into `world_size` chunks and return the first
+        and last index of the vocabulary belonging to the `rank`
+        partition: Note that indices in [fist, last)
+
+    """
+
+    @staticmethod
+    def vocab_range_from_per_partition_vocab_size(
+        per_partition_vocab_size: int, rank, world_size: int
+    ) -> Sequence[int]:
+        index_f = rank * per_partition_vocab_size
+        index_l = index_f + per_partition_vocab_size
+        return index_f, index_l
+
+    @staticmethod
+    def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int) -> Sequence[int]:
+        per_partition_vocab_size = divide(global_vocab_size, world_size)
+        return VocabUtility.vocab_range_from_per_partition_vocab_size(
+            per_partition_vocab_size, rank, world_size
+        )
--- a/cacheflow/parallel_utils/utils.py
+++ b/cacheflow/parallel_utils/utils.py
+"""Utility functions used throughout Megatron core"""
+from functools import reduce
+import operator
+
+import torch
+
+from cacheflow.parallel_utils import parallel_state
+
+
+def ensure_divisibility(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator."""
+    assert numerator % denominator == 0, "{} is not divisible by {}".format(
+        numerator, denominator
+    )
+
+
+def divide(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator and return
+    the division value."""
+    ensure_divisibility(numerator, denominator)
+    return numerator // denominator
+
+
+class GlobalMemoryBuffer:
+    """Global buffer to avoid dynamic memory allocations.
+    Caller should ensure that buffers of the same name
+    are not used concurrently."""
+
+    def __init__(self):
+        self.buffer = {}
+
+    def get_tensor(self, tensor_shape, dtype, name):
+        required_len = reduce(operator.mul, tensor_shape, 1)
+        if self.buffer.get((name, dtype), None) is None or \
+                self.buffer[(name, dtype)].numel() < required_len:
+            self.buffer[(name, dtype)] = \
+                torch.empty(required_len,
+                            dtype=dtype,
+                            device=torch.cuda.current_device(),
+                            requires_grad=False)
+
+        return self.buffer[(name, dtype)][0:required_len].view(*tensor_shape)
+
+def _kernel_make_viewless_tensor(inp, requires_grad):
+    '''Make a viewless tensor.
+
+    View tensors have the undesirable side-affect of retaining a reference
+    to the originally-viewed tensor, even after manually setting the '.data'
+    field. This method creates a new tensor that links to the old tensor's
+    data, without linking the viewed tensor, referenced via the '._base'
+    field.
+    '''
+    out = torch.empty(
+        (1,),
+        dtype = inp.dtype,
+        device = inp.device,
+        requires_grad = requires_grad,
+    )
+    out.data = inp.data
+    return out
+
+class MakeViewlessTensor(torch.autograd.Function):
+    '''
+    Autograd function to make a viewless tensor.
+
+    This function should be used in cases where the computation graph needs
+    to be propagated, but we only want a viewless tensor (e.g.,
+    ParallelTransformer's hidden_states). Call this function by passing
+    'keep_graph = True' to 'make_viewless_tensor()'.
+    '''
+    @staticmethod
+    def forward(ctx, inp, requires_grad):
+        return _kernel_make_viewless_tensor(inp, requires_grad)
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output, None
+
+def make_viewless_tensor(inp, requires_grad, keep_graph):
+    '''
+    Entry-point for creating viewless tensors.
+
+    This method should be used, rather than calling 'MakeViewlessTensor'
+    or '_kernel_make_viewless_tensor' directly. This method acts as a
+    switch for determining if an autograd function or a regular method
+    should be used to create the tensor.
+    '''
+
+    # return tensor as-is, if not a 'view'
+    if inp._base is None:
+        return inp
+
+    # create viewless tensor
+    if keep_graph:
+        return MakeViewlessTensor.apply(inp, requires_grad)
+    else:
+        return _kernel_make_viewless_tensor(inp, requires_grad)
+
+def assert_viewless_tensor(tensor, extra_msg = None):
+    '''Assert that a tensor is not a view (i.e., its '._base' field is
+    not set).'''
+    if isinstance(tensor, list):
+        [ assert_viewless_tensor(t) for t in tensor ]
+        return tensor
+    if not isinstance(tensor, torch.Tensor):
+        return tensor
+    assert tensor._base is None, (
+        "Ensure tensor._base is None before setting tensor.data or storing "
+        "tensor to memory buffer. Otherwise, a memory leak will occur (and "
+        "likely accumulate over iterations). %s"
+    ) % extra_msg
+    return tensor
+
+def safely_set_viewless_tensor_data(tensor, new_data_tensor):
+    '''Safely set tensor's '.data' field.
+
+    Check first that the tensor is viewless (i.e., '._base' not set). If not,
+    raise an exception.
+    '''
+    assert_viewless_tensor(tensor, extra_msg = "FYI, tensor._base has shape %s, and new_data_tensor has shape %s." % ("--" if tensor._base is None else tensor._base.shape, new_data_tensor.shape))
+    tensor.data = new_data_tensor
--- a/cacheflow/sequence.py
+++ b/cacheflow/sequence.py
@@ -158,3 +158,9 @@ class SequenceOutputs:
                f'parent_seq_id={self.parent_seq_id}, '
                f'output_token={self.output_token}), '
                f'logprobs={self.logprobs}')
+
+    def __eq__(self, other: 'SequenceOutputs') -> bool:
+        return (self.seq_id == other.seq_id and
+                self.parent_seq_id == other.parent_seq_id and
+                self.output_token == other.output_token and
+                self.logprobs == other.logprobs)
--- a/cacheflow/utils.py
+++ b/cacheflow/utils.py
 import enum
+import random
+
+import numpy as np
+import torch
+
+from cacheflow.parallel_utils.parallel_state import model_parallel_is_initialized
+from cacheflow.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed


 class Device(enum.Enum):
@@ -18,3 +25,13 @@ class Counter:

    def reset(self) -> None:
        self.counter = 0
+
+def set_random_seed(seed: int):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+
+    if model_parallel_is_initialized():
+        model_parallel_cuda_manual_seed(seed)