# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import pytest
import torch
from tensordict import TensorDict
from verl.protocol import union_tensor_dict, union_numpy_dict
from verl import DataProto
import numpy as np
def test_union_tensor_dict():
obs = torch.randn(100, 10)
data1 = TensorDict({'obs': obs, 'act': torch.randn(100, 3)}, batch_size=[100])
data2 = TensorDict({'obs': obs, 'next_obs': torch.randn(100, 10), 'rew': torch.randn(100)}, batch_size=[100])
data_with_copied_obs = TensorDict({
'obs': obs.clone(),
'next_obs': torch.randn(100, 10),
'rew': torch.randn(100)
},
batch_size=[100])
data = union_tensor_dict(data1, data2)
with pytest.raises(AssertionError):
data = union_tensor_dict(data1, data_with_copied_obs)
data = np.random.random(100)
data2 = [float('nan') for _ in range(99)]
data2.append('nan')
data2 = np.array(data2, dtype=object)
data3 = np.tile(data2, (2, 1))
a = {'a': data, 'b': data2, 'c': data3}
b = {'a': data, 'b': data2, 'c': data3}
b_ = {'a': np.random.random(100)}
union_numpy_dict(a, b)
with pytest.raises(AssertionError):
union_numpy_dict(a, b_)
def test_tensor_dict_constructor():
obs = torch.randn(100, 10)
act = torch.randn(100, 10, 3)
data = DataProto.from_dict(tensors={'obs': obs, 'act': act})
assert data.batch.batch_size == torch.Size([100])
with pytest.raises(AssertionError):
data = DataProto.from_dict(tensors={'obs': obs, 'act': act}, num_batch_dims=2)
with pytest.raises(AssertionError):
data = DataProto.from_dict(tensors={'obs': obs, 'act': act}, num_batch_dims=3)
def test_tensor_dict_make_iterator():
obs = torch.randn(100, 10)
labels = [random.choice(['abc', 'cde']) for _ in range(100)]
dataset = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels})
data_iter_1 = dataset.make_iterator(mini_batch_size=10, epochs=2, seed=1)
data_list_1 = []
for data in data_iter_1:
data_list_1.append(data)
data_iter_2 = dataset.make_iterator(mini_batch_size=10, epochs=2, seed=1)
data_list_2 = []
for data in data_iter_2:
data_list_2.append(data)
for data1, data2 in zip(data_list_1, data_list_2):
assert isinstance(data1, DataProto)
assert isinstance(data2, DataProto)
result = torch.all(torch.eq(data1.batch['obs'], data2.batch['obs']))
if not result.item():
print(data1.batch['obs'])
print(data2.batch['obs'])
assert False
non_tensor_result = np.all(np.equal(data1.non_tensor_batch['labels'], data2.non_tensor_batch['labels']))
if not non_tensor_result.item():
print(data1.non_tensor_batch['labels'])
print(data2.non_tensor_batch['labels'])
assert False
def test_reorder():
obs = torch.tensor([1, 2, 3, 4, 5, 6])
labels = ['a', 'b', 'c', 'd', 'e', 'f']
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'name': 'abdce'})
data.reorder(torch.tensor([3, 4, 2, 0, 1, 5]))
assert torch.all(torch.eq(data.batch['obs'], torch.tensor([4, 5, 3, 1, 2, 6])))
assert np.all(data.non_tensor_batch['labels'] == np.array(['d', 'e', 'c', 'a', 'b', 'f']))
assert data.meta_info == {'name': 'abdce'}
def test_chunk_concat():
obs = torch.tensor([1, 2, 3, 4, 5, 6])
labels = ['a', 'b', 'c', 'd', 'e', 'f']
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'name': 'abdce'})
with pytest.raises(AssertionError):
data.chunk(5)
data_split = data.chunk(2)
assert len(data_split) == 2
assert torch.all(torch.eq(data_split[0].batch['obs'], torch.tensor([1, 2, 3])))
assert np.all(data_split[0].non_tensor_batch['labels'] == np.array(['a', 'b', 'c']))
assert data_split[0].meta_info == {'name': 'abdce'}
assert torch.all(torch.eq(data_split[1].batch['obs'], torch.tensor([4, 5, 6])))
assert np.all(data_split[1].non_tensor_batch['labels'] == np.array(['d', 'e', 'f']))
assert data_split[1].meta_info == {'name': 'abdce'}
concat_data = DataProto.concat(data_split)
assert torch.all(torch.eq(concat_data.batch['obs'], data.batch['obs']))
assert np.all(concat_data.non_tensor_batch['labels'] == data.non_tensor_batch['labels'])
assert concat_data.meta_info == data.meta_info
def test_pop():
obs = torch.randn(100, 10)
act = torch.randn(100, 3)
dataset = DataProto.from_dict({'obs': obs, 'act': act}, meta_info={'2': 2, '1': 1})
popped_dataset = dataset.pop(batch_keys=['obs'], meta_info_keys=['2'])
assert popped_dataset.batch.keys() == {'obs'}
assert popped_dataset.meta_info.keys() == {'2'}
assert dataset.batch.keys() == {'act'}
assert dataset.meta_info.keys() == {'1'}
def test_repeat():
# Create a DataProto object with some batch and non-tensor data
obs = torch.tensor([[1, 2], [3, 4], [5, 6]])
labels = ['a', 'b', 'c']
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'info': 'test_info'})
# Test interleave=True
repeated_data_interleave = data.repeat(repeat_times=2, interleave=True)
expected_obs_interleave = torch.tensor([[1, 2], [1, 2], [3, 4], [3, 4], [5, 6], [5, 6]])
expected_labels_interleave = ['a', 'a', 'b', 'b', 'c', 'c']
assert torch.all(torch.eq(repeated_data_interleave.batch['obs'], expected_obs_interleave))
assert (repeated_data_interleave.non_tensor_batch['labels'] == expected_labels_interleave).all()
assert repeated_data_interleave.meta_info == {'info': 'test_info'}
# Test interleave=False
repeated_data_no_interleave = data.repeat(repeat_times=2, interleave=False)
expected_obs_no_interleave = torch.tensor([[1, 2], [3, 4], [5, 6], [1, 2], [3, 4], [5, 6]])
expected_labels_no_interleave = ['a', 'b', 'c', 'a', 'b', 'c']
assert torch.all(torch.eq(repeated_data_no_interleave.batch['obs'], expected_obs_no_interleave))
assert (repeated_data_no_interleave.non_tensor_batch['labels'] == expected_labels_no_interleave).all()
assert repeated_data_no_interleave.meta_info == {'info': 'test_info'}
def test_dataproto_pad_unpad():
obs = torch.tensor([[1, 2], [3, 4], [5, 6]])
labels = ['a', 'b', 'c']
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'info': 'test_info'})
from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
padded_data, pad_size = pad_dataproto_to_divisor(data, size_divisor=2)
assert pad_size == 1
expected_obs = torch.tensor([[1, 2], [3, 4], [5, 6], [1, 2]])
expected_labels = ['a', 'b', 'c', 'a']
assert torch.all(torch.eq(padded_data.batch['obs'], expected_obs))
assert (padded_data.non_tensor_batch['labels'] == expected_labels).all()
assert padded_data.meta_info == {'info': 'test_info'}
unpadded_data = unpad_dataproto(padded_data, pad_size=pad_size)
assert torch.all(torch.eq(unpadded_data.batch['obs'], obs))
assert (unpadded_data.non_tensor_batch['labels'] == labels).all()
assert unpadded_data.meta_info == {'info': 'test_info'}
padded_data, pad_size = pad_dataproto_to_divisor(data, size_divisor=3)
assert pad_size == 0
expected_obs = torch.tensor([[1, 2], [3, 4], [5, 6]])
expected_labels = ['a', 'b', 'c']
assert torch.all(torch.eq(padded_data.batch['obs'], expected_obs))
assert (padded_data.non_tensor_batch['labels'] == expected_labels).all()
assert padded_data.meta_info == {'info': 'test_info'}
unpadded_data = unpad_dataproto(padded_data, pad_size=pad_size)
assert torch.all(torch.eq(unpadded_data.batch['obs'], obs))
assert (unpadded_data.non_tensor_batch['labels'] == labels).all()
assert unpadded_data.meta_info == {'info': 'test_info'}
padded_data, pad_size = pad_dataproto_to_divisor(data, size_divisor=7)
assert pad_size == 4
expected_obs = torch.tensor([[1, 2], [3, 4], [5, 6], [1, 2], [3, 4], [5, 6], [1, 2]])
expected_labels = ['a', 'b', 'c', 'a', 'b', 'c', 'a']
assert torch.all(torch.eq(padded_data.batch['obs'], expected_obs))
assert (padded_data.non_tensor_batch['labels'] == expected_labels).all()
assert padded_data.meta_info == {'info': 'test_info'}
unpadded_data = unpad_dataproto(padded_data, pad_size=pad_size)
assert torch.all(torch.eq(unpadded_data.batch['obs'], obs))
assert (unpadded_data.non_tensor_batch['labels'] == labels).all()
assert unpadded_data.meta_info == {'info': 'test_info'}
def test_dataproto_fold_unfold():
from verl.protocol import fold_batch_dim, unfold_batch_dim, DataProto
obs = torch.tensor([[1, 2], [3, 4], [5, 6]])
labels = ['a', 'b', 'c']
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'info': 'test_info'})
data1 = data.repeat(repeat_times=2, interleave=True)
data2 = fold_batch_dim(data1, new_batch_size=3)
torch.testing.assert_close(data2.batch['obs'], torch.tensor([[[1, 2], [1, 2]], [[3, 4], [3, 4]], [[5, 6], [5, 6]]]))
assert (data2.non_tensor_batch['labels'] == [['a', 'a'], ['b', 'b'], ['c', 'c']]).all()
data2.reorder(indices=torch.tensor([1, 2, 0]))
data3 = unfold_batch_dim(data2, batch_dims=2)
torch.testing.assert_close(data3.batch['obs'], torch.tensor([[3, 4], [3, 4], [5, 6], [5, 6], [1, 2], [1, 2]]))
assert (data3.non_tensor_batch['labels'] == ['b', 'b', 'c', 'c', 'a', 'a']).all()
assert data3.meta_info == {'info': 'test_info'}
def test_torch_save_data_proto():
obs = torch.tensor([[1, 2], [3, 4], [5, 6]])
labels = ['a', 'b', 'c']
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'info': 'test_info'})
data.save_to_disk('test_data.pt')
loaded_data = DataProto.load_from_disk('test_data.pt')
assert torch.all(torch.eq(loaded_data.batch['obs'], data.batch['obs']))
assert (loaded_data.non_tensor_batch['labels'] == data.non_tensor_batch['labels']).all()
assert loaded_data.meta_info == data.meta_info
import os
os.remove('test_data.pt')
def test_len():
obs = torch.tensor([[1, 2], [3, 4], [5, 6]])
labels = np.array(['a', 'b', 'c'], dtype=object)
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'info': 'test_info'})
assert len(data) == 3
data = DataProto(batch=None, non_tensor_batch={'labels': labels}, meta_info={'info': 'test_info'})
assert len(data) == 3
data = DataProto(batch=None, non_tensor_batch={}, meta_info={'info': 'test_info'})
assert len(data) == 0
data = DataProto(batch=None, non_tensor_batch=None, meta_info={'info': 'test_info'})
assert len(data) == 0
def test_seqlen_balancing():
from verl.utils.seqlen_balancing import rearrange_micro_batches, get_reverse_idx
input_ids = torch.randint(low=0, high=10, size=(20, 100))
from verl.utils.model import create_random_mask
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0.1,
max_ratio_of_valid_token=0.9,
min_ratio_of_valid_token=0.5)
data = {'input_ids': input_ids, 'attention_mask': attention_mask}
dataproto = DataProto.from_single_dict(data)
micro_batches, micro_bsz_idx_lst = rearrange_micro_batches(dataproto.batch, max_token_len=300)
batch = torch.cat(micro_batches)
micro_bsz_idx = []
for idx in micro_bsz_idx_lst:
micro_bsz_idx.extend(idx)
reverse_idx_map = get_reverse_idx(micro_bsz_idx)
reverse_idx_map = torch.tensor(reverse_idx_map)
new_batch = batch[reverse_idx_map]
torch.testing.assert_close(new_batch, dataproto.batch)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test the MultiTurnSFTDataset implementation
"""
import os
import pandas as pd
import torch
from transformers import AutoTokenizer
from verl.utils.dataset.multiturn_sft_dataset import MultiTurnSFTDataset
def test_multiturn_sft_dataset():
print("Starting test...")
# Create a temporary parquet file with test data
test_data = {
'messages': [[{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "What is 2+2?"
}, {
"role": "assistant",
"content": "2+2 equals 4."
}, {
"role": "user",
"content": "And what is 4+4?"
}, {
"role": "assistant",
"content": "4+4 equals 8."
}],
[{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "Tell me a joke."
}, {
"role": "assistant",
"content": "Why did the chicken cross the road?"
}, {
"role": "user",
"content": "Why?"
}, {
"role": "assistant",
"content": "To get to the other side!"
}]]
}
# Create test directory if it doesn't exist
os.makedirs('test_data', exist_ok=True)
test_file = 'test_data/test.parquet'
# Save test data to parquet
df = pd.DataFrame(test_data)
df.to_parquet(test_file)
# Initialize tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-Coder-7B-Instruct')
config = {'max_length': 512, 'truncation': 'error', 'multiturn': {'messages_key': 'messages'}}
dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, config=config)
# Test 1: Dataset Length
assert len(dataset) == 2, f"Expected dataset length 2, got {len(dataset)}"
# Get items for testing
item0 = dataset[0] # Math conversation
item1 = dataset[1] # Joke conversation
# Test 2: Required Keys and Types
required_keys = ['input_ids', 'attention_mask', 'position_ids', 'loss_mask']
for key in required_keys:
assert key in item0, f"Missing key {key} in dataset item"
assert isinstance(item0[key], torch.Tensor), f"Expected torch.Tensor for {key}"
assert item0[key].dtype == torch.long, f"Expected torch.long for {key}, got {item0[key].dtype}"
# Test 3: Shape Consistency
assert item0['loss_mask'].shape == item0['input_ids'].shape, \
"Loss mask shape doesn't match input_ids shape"
assert item0['attention_mask'].shape == item0['input_ids'].shape, \
"Attention mask shape doesn't match input_ids shape"
assert item0['position_ids'].shape == item0['input_ids'].shape, \
"Position IDs shape doesn't match input_ids shape"
# Test 4: Loss Mask Pattern - Math Conversation
loss_mask0 = item0['loss_mask']
input_ids0 = item0['input_ids']
# Find assistant response positions
assistant_positions0 = torch.where(loss_mask0 == 1)[0]
assert len(assistant_positions0) > 0, "No assistant positions found in loss mask"
# Decode and verify assistant responses
assistant_text0 = tokenizer.decode(input_ids0[loss_mask0 == 1])
print(f"Math conversation assistant text: {assistant_text0}")
assert "2+2 equals 4" in assistant_text0, "First assistant response not found"
assert "4+4 equals 8" in assistant_text0, "Second assistant response not found"
# Test 5: Loss Mask Pattern - Joke Conversation
loss_mask1 = item1['loss_mask']
input_ids1 = item1['input_ids']
# Find assistant response positions
assistant_positions1 = torch.where(loss_mask1 == 1)[0]
assert len(assistant_positions1) > 0, "No assistant positions found in loss mask"
# Decode and verify assistant responses
assistant_text1 = tokenizer.decode(input_ids1[loss_mask1 == 1])
print(f"Joke conversation assistant text: {assistant_text1}")
assert "chicken cross the road" in assistant_text1, "First assistant response not found"
assert "other side" in assistant_text1, "Second assistant response not found"
# Test 6: Attention Mask Pattern
attention_mask0 = item0['attention_mask']
sequence_length = torch.sum(attention_mask0)
assert sequence_length > 0, "No tokens marked as attended in attention mask"
assert torch.all(attention_mask0[:sequence_length] == 1), "Incorrect attention mask pattern"
if sequence_length < len(attention_mask0):
assert torch.all(attention_mask0[sequence_length:] == 0), "Padding not properly masked"
# Test 7: Position IDs Pattern
position_ids0 = item0['position_ids']
assert torch.equal(position_ids0[:sequence_length], torch.arange(sequence_length)), \
"Position IDs not sequential for non-padded tokens"
if sequence_length < len(position_ids0):
assert torch.all(position_ids0[sequence_length:] == 0), "Padding position IDs not zero"
# Test 8: Verify loss mask for assistant responses
# Get the full conversation text
full_text = tokenizer.decode(input_ids0)
print(f"\nFull conversation text:\n{full_text}")
# Get the assistant responses
assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 1])
print(f"\nAssistant responses (from loss mask):\n{assistant_text}")
# Verify that loss mask is set for all assistant responses
for msg in test_data['messages'][0]: # First conversation
if msg['role'] == 'assistant':
# The content should appear in the masked text
assert msg['content'] in assistant_text, \
f"Assistant message '{msg['content']}' not found in masked text"
# The content should NOT appear in the non-masked text
non_assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 0])
assert msg['content'] not in non_assistant_text, \
f"Assistant message '{msg['content']}' found in non-assistant text"
# Test 9: Verify non-assistant parts have loss_mask=0
# Get non-assistant text
non_assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 0])
print(f"\nNon-assistant text (from loss mask):\n{non_assistant_text}")
# Verify that system and user messages are in the non-assistant text
for msg in test_data['messages'][0]: # First conversation
if msg['role'] in ['system', 'user']:
assert msg['content'] in non_assistant_text, \
f"{msg['role'].title()} message '{msg['content']}' not found in non-assistant text"
# And verify they're NOT in the assistant text
assert msg['content'] not in assistant_text, \
f"{msg['role'].title()} message '{msg['content']}' found in assistant text"
# Test 10: Verify padding behavior
padding_config = {'max_length': 1024, 'truncation': 'error', 'multiturn': {'messages_key': 'messages'}}
small_dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, config=padding_config)
padded_item = small_dataset[0]
# Get actual sequence length (before padding)
actual_length = torch.sum(padded_item['attention_mask'])
# Verify padding tokens
assert torch.all(padded_item['input_ids'][actual_length:] == tokenizer.pad_token_id), \
"Padding tokens not set correctly"
assert torch.all(padded_item['attention_mask'][actual_length:] == 0), \
"Attention mask not set correctly for padding"
assert torch.all(padded_item['loss_mask'][actual_length:] == 0), \
"Loss mask not set correctly for padding"
print("All tests passed!")
print("Starting test...")
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from omegaconf import OmegaConf
def get_gsm8k_data():
# prepare test dataset
url = "https://github.com/eric-haibin-lin/verl-data/raw/refs/heads/main/gsm8k/train.parquet"
local_folder = os.path.expanduser('~/verl-data/gsm8k/')
local_path = os.path.join(local_folder, 'train.parquet')
os.makedirs(local_folder, exist_ok=True)
return local_path
def test_rl_dataset():
from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn
from verl.utils import hf_tokenizer
tokenizer = hf_tokenizer('deepseek-ai/deepseek-coder-1.3b-instruct')
local_path = get_gsm8k_data()
config = OmegaConf.create({
"prompt_key": "prompt",
"max_prompt_length": 256,
"filter_overlong_prompts": True,
"filter_overlong_prompts_workers": 2,
})
dataset = RLHFDataset(data_files=local_path, tokenizer=tokenizer, config=config)
dataloader = DataLoader(dataset=dataset, batch_size=16, shuffle=True, drop_last=True, collate_fn=collate_fn)
a = next(iter(dataloader))
from verl import DataProto
tensors = {}
non_tensors = {}
for key, val in a.items():
if isinstance(val, torch.Tensor):
tensors[key] = val
else:
non_tensors[key] = val
data_proto = DataProto.from_dict(tensors=tensors, non_tensors=non_tensors)
assert 'input_ids' in data_proto.batch
data = dataset[0]['input_ids']
output = tokenizer.batch_decode([data])[0]
print(f'type: {type(output)}')
print(f'\n\noutput: {output}')
def test_image_rl_data():
from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn
from verl.utils import hf_tokenizer, hf_processor
tokenizer = hf_tokenizer('Qwen/Qwen2-VL-2B-Instruct')
processor = hf_processor('Qwen/Qwen2-VL-2B-Instruct')
config = OmegaConf.create({
"prompt_key": "prompt",
"max_prompt_length": 1024,
"filter_overlong_prompts": True,
"filter_overlong_prompts_workers": 2,
})
dataset = RLHFDataset(data_files=os.path.expanduser("~/data/geo3k/train.parquet"),
tokenizer=tokenizer,
config=config,
processor=processor)
dataloader = DataLoader(dataset=dataset, batch_size=16, shuffle=True, drop_last=True, collate_fn=collate_fn)
a = next(iter(dataloader))
from verl import DataProto
tensors = {}
non_tensors = {}
for key, val in a.items():
if isinstance(val, torch.Tensor):
tensors[key] = val
else:
non_tensors[key] = val
data_proto = DataProto.from_dict(tensors=tensors, non_tensors=non_tensors)
assert 'multi_modal_data' in data_proto.non_tensor_batch
assert 'multi_modal_inputs' in data_proto.non_tensor_batch
data = dataset[0]['input_ids']
output = tokenizer.batch_decode([data])[0]
print(f'type: {type(output)}')
print(f'\n\noutput: {output}')
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from transformers import AutoTokenizer
from verl.utils import hf_tokenizer
from verl.utils.dataset.rm_dataset import RMDataset
def get_rm_data():
# prepare test dataset
url = "https://github.com/eric-haibin-lin/verl-data/raw/refs/heads/main/full_hh_rlhf/rm/test.parquet"
local_folder = os.path.expanduser('~/verl-data/full_hh_rlhf/rm/')
local_path = os.path.join(local_folder, 'test.parquet')
os.makedirs(local_folder, exist_ok=True)
return local_path
def test_rm_dataset():
tokenizer = hf_tokenizer("facebook/opt-1.3b")
local_path = get_rm_data()
dataset = RMDataset(parquet_files=local_path, tokenizer=tokenizer, max_length=512)
data = dataset[0]['input_ids']
output = tokenizer.batch_decode(data)
assert len(output) > 1
assert type(output[0]) == str
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from transformers import AutoTokenizer
from verl.utils import hf_tokenizer
from verl.utils.dataset.sft_dataset import SFTDataset
def get_gsm8k_data():
# prepare test dataset
url = "https://github.com/eric-haibin-lin/verl-data/raw/refs/heads/main/gsm8k/train.parquet"
local_folder = os.path.expanduser('~/verl-data/gsm8k/')
local_path = os.path.join(local_folder, 'train.parquet')
return local_path
def test_sft_cot_dataset():
tokenizer = hf_tokenizer('deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct')
local_path = get_gsm8k_data()
from omegaconf import OmegaConf
dataset = SFTDataset(parquet_files=local_path,
tokenizer=tokenizer,
config=OmegaConf.create({
'prompt_key': 'prompt',
'prompt_dict_keys': ['content'],
'response_key': 'extra_info',
'response_dict_keys': ['answer'],
'max_length': 512,
}))
data = dataset[0]['input_ids']
output = tokenizer.batch_decode([data])[0]
assert len(output) > 1
assert type(output) == str
def test_sft_dataset():
tokenizer = hf_tokenizer('deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct')
local_path = get_gsm8k_data()
from omegaconf import OmegaConf
dataset = SFTDataset(parquet_files=local_path,
tokenizer=tokenizer,
config=OmegaConf.create({
"prompt_key": 'extra_info',
'prompt_dict_keys': ['question'],
'response_key': 'extra_info',
'response_dict_keys': ['answer'],
'max_length': 512
}))
data = dataset[0]['input_ids']
output = tokenizer.batch_decode([data])[0]
assert len(output) > 1
assert type(output) == str
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import importlib.util
import pytest
from verl.utils.import_utils import load_extern_type
# Path to the test module
TEST_MODULE_PATH = os.path.join(os.path.dirname(__file__), "test_module.py")
def test_load_extern_type_class():
"""Test loading a class from an external file"""
TestClass = load_extern_type(TEST_MODULE_PATH, "TestClass")
# Verify the class was loaded correctly
assert TestClass is not None
assert TestClass.__name__ == "TestClass"
# Test instantiation and functionality
instance = TestClass()
assert instance.value == "default"
# Test with a custom value
custom_instance = TestClass("custom")
assert custom_instance.get_value() == "custom"
def test_load_extern_type_function():
"""Test loading a function from an external file"""
test_function = load_extern_type(TEST_MODULE_PATH, "test_function")
# Verify the function was loaded correctly
assert test_function is not None
assert callable(test_function)
# Test function execution
result = test_function()
assert result == "test_function_result"
def test_load_extern_type_constant():
"""Test loading a constant from an external file"""
constant = load_extern_type(TEST_MODULE_PATH, "TEST_CONSTANT")
# Verify the constant was loaded correctly
assert constant is not None
assert constant == "test_constant_value"
def test_load_extern_type_nonexistent_file():
"""Test behavior when file doesn't exist"""
with pytest.raises(FileNotFoundError):
load_extern_type("/nonexistent/path.py", "SomeType")
def test_load_extern_type_nonexistent_type():
"""Test behavior when type doesn't exist in the file"""
with pytest.raises(AttributeError):
load_extern_type(TEST_MODULE_PATH, "NonExistentType")
def test_load_extern_type_none_path():
"""Test behavior when file path is None"""
result = load_extern_type(None, "SomeType")
assert result is None
def test_load_extern_type_invalid_module():
"""Test behavior when module has syntax errors"""
# Create a temporary file with syntax errors
import tempfile
with tempfile.NamedTemporaryFile(suffix='.py', mode='w+', delete=False) as temp_file:
temp_file.write("This is not valid Python syntax :")
temp_path = temp_file.name
try:
with pytest.raises(RuntimeError):
load_extern_type(temp_path, "SomeType")
finally:
# Clean up the temporary file
if os.path.exists(temp_path):
os.remove(temp_path)
# Copyright 2025 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Test module for import_utils.load_extern_type testing
class TestClass:
"""A test class to be imported by load_extern_type"""
def __init__(self, value=None):
self.value = value or "default"
def get_value(self):
return self.value
TEST_CONSTANT = "test_constant_value"
def test_function():
return "test_function_result"
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
version_folder = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(version_folder, 'version/version')) as f:
__version__ = f.read().strip()
from .protocol import DataProto
from .utils.logging_utils import set_basic_config
import logging
set_basic_config(level=logging.WARNING)
from . import single_controller
__all__ = ['DataProto', "__version__"]
if os.getenv('VERL_USE_MODELSCOPE', 'False').lower() == 'true':
import importlib.util
if importlib.util.find_spec("modelscope") is None:
raise ImportError('You are using the modelscope hub, please install modelscope with `pip install modelscope -U`')
# Patch hub to download models from modelscope to speed up.
from modelscope.utils.hf_util import patch_hub
patch_hub()
# Models
Common model zoos such as huggingface/transformers struggle when used with PyTorch native model parallelism. Following the design principle of vLLM, we keep a simple, parallelizable, highly-optimized model implementation with packed inputs in verl.
## Adding a New Huggingface Model
### Step 1: Copy the model file from HF to verl
- Add a new file under verl/models/hf
- Copy ONLY the model file from huggingface/transformers/models to verl/models/hf
### Step 2: Modify the model file to use packed inputs
- Remove all the code related to inference (kv cache)
- Modify the inputs to include only
- input_ids (total_nnz,)
- cu_seqlens (total_nnz + 1,)
- max_seqlen_in_batch: int
- Note that this requires using flash attention with a causal mask (see the sketch below).
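
Below is a minimal sketch of the packed-input calling convention, not verl's actual model code: `packed_attention` and `build_cu_seqlens` are illustrative names, and the `flash-attn` package is assumed to be installed. Sequences are concatenated along a single `total_nnz` dimension and delimited by `cu_seqlens`, which is what flash attention's varlen kernel expects.

```python
import torch
from flash_attn import flash_attn_varlen_func  # assumption: flash-attn is installed


def build_cu_seqlens(attention_mask: torch.Tensor) -> torch.Tensor:
    # prefix sum of per-sequence valid lengths, shape (batch_size + 1,)
    seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)
    return torch.nn.functional.pad(seqlens.cumsum(0, dtype=torch.int32), (1, 0))


def packed_attention(q, k, v, cu_seqlens, max_seqlen_in_batch):
    # q, k, v: (total_nnz, num_heads, head_dim) -- no batch or padding dimension
    return flash_attn_varlen_func(
        q, k, v,
        cu_seqlens_q=cu_seqlens,
        cu_seqlens_k=cu_seqlens,
        max_seqlen_q=max_seqlen_in_batch,
        max_seqlen_k=max_seqlen_in_batch,
        causal=True,  # causal mask, as noted above
    )
```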
### Step 2.5: Add tests
- Add a test that compares this version against the huggingface version
- Follow the existing test infrastructure and add the tests under tests/models/hf
### Step 3: Add a function to apply tensor parallelism
- Please follow
- https://pytorch.org/docs/stable/distributed.tensor.parallel.html
- https://pytorch.org/tutorials/intermediate/TP_tutorial.html
- General comments
- Tensor parallelism in native PyTorch is NOT auto-parallelism. It works by specifying, via configs, how model parameters and inputs/outputs are resharded; these configs are registered as hooks that reshard inputs/outputs before/after the model forward (see the sketch below).
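
As an illustration of the comment above, here is a rough sketch (following the linked tutorials, not verl's actual code) of registering such a plan with PyTorch's native TP API. It assumes a HF-Llama-style module layout with `q_proj`/`k_proj`/`v_proj`/`o_proj` and `gate_proj`/`up_proj`/`down_proj` submodules; `apply_tensor_parallel` is an illustrative name.

```python
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, parallelize_module


def apply_tensor_parallel(model, tp_size: int):
    tp_mesh = init_device_mesh("cuda", (tp_size,))  # 1-D mesh over the TP ranks
    layer_plan = {
        # column-parallel: shard the output dimension of the projection
        "self_attn.q_proj": ColwiseParallel(),
        "self_attn.k_proj": ColwiseParallel(),
        "self_attn.v_proj": ColwiseParallel(),
        "mlp.gate_proj": ColwiseParallel(),
        "mlp.up_proj": ColwiseParallel(),
        # row-parallel: shard the input dimension and all-reduce the output
        "self_attn.o_proj": RowwiseParallel(),
        "mlp.down_proj": RowwiseParallel(),
    }
    # parallelize_module registers the plan as pre/post-forward hooks that
    # reshard inputs/outputs around each block's forward pass
    for block in model.model.layers:
        parallelize_module(block, tp_mesh, layer_plan)
    return model
```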
### Step 4: Add a function to apply data parallelism
- Please use FSDP2 APIs
- See the demo here: https://github.com/pytorch/torchtitan/blob/main/torchtitan/parallelisms/parallelize_llama.py#L413 (a sketch of the pattern follows below)
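
A minimal sketch of the FSDP2 pattern used in the torchtitan demo linked above. The import path varies by PyTorch version (nightlies around 2.4 expose `fully_shard` under `torch.distributed._composable.fsdp`, newer releases under `torch.distributed.fsdp`); `apply_fsdp` and `dp_mesh` are illustrative names, not verl APIs.

```python
import torch
from torch.distributed._composable.fsdp import MixedPrecisionPolicy, fully_shard


def apply_fsdp(model, dp_mesh):
    # compute in bf16, reduce gradients in fp32
    mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
    # shard each transformer block individually, then wrap the root module
    for block in model.model.layers:
        fully_shard(block, mesh=dp_mesh, mp_policy=mp_policy)
    fully_shard(model, mesh=dp_mesh, mp_policy=mp_policy)
    return model
```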
### Step 5: Add a function to apply pipeline parallelism
- Coming in PyTorch 2.4
- Currently only available in alpha in the nightly builds
- Check torchtitan for more details
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .modeling_llama_megatron import (
# original model with megatron
ParallelLlamaModel,
ParallelLlamaForCausalLM,
# rmpad with megatron
ParallelLlamaForCausalLMRmPad,
ParallelLlamaForValueRmPad,
# rmpad with megatron and pipeline parallelism
ParallelLlamaForCausalLMRmPadPP,
ParallelLlamaForValueRmPadPP)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
from packaging.version import Version
import torch
import time
from typing import Dict, Any, Callable, Optional
import torch.distributed as dist
def _megatron_calc_layer_map(config):
"""Calculate the mapping of global layer_idx to local layer_idx
Returns:
layer_map (Dict: int -> tuple(int, int, int)):
mapping from the global layer index to
a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
"""
from megatron.core import mpu
print(f'get megatron data parallel size: {mpu.get_data_parallel_world_size()}')
pp_size = mpu.get_pipeline_model_parallel_world_size()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
layer_map = dict()
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
for pp_rank_idx in range(pp_size):
for virtual_pp_rank_idx in range(virtual_pp_size):
layer_offset = (virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) +
pp_rank_idx * num_layers_per_model)
for layer_idx in range(num_layers_per_model):
layer_map[layer_offset + layer_idx] = (
pp_rank_idx,
virtual_pp_rank_idx,
layer_idx,
)
return layer_map
def load_state_dict_to_megatron_llama(state_dict,
wrapped_models,
config,
params_dtype,
is_value_model=False,
tie_word_embeddings=False):
"""Load merged state_dict to sharded Megatron module in training.
"""
from megatron.core import mpu
from verl.utils.megatron_utils import print_rank_0, unwrap_model
from megatron.core.transformer.module import Float16Module
from megatron.core import DistributedDataParallel as LocalDDP
from torch.nn.parallel import DistributedDataParallel as torchDDP
start_time = time.time()
def _get_gpt_model(model):
return model
def fetch_params(module):
# pull parameters from the data-parallel source rank; torch.distributed has no
# `fetch` collective, so a broadcast from the DP src rank is used instead
for param in module.parameters():
torch.distributed.broadcast(param.data,
src=mpu.get_data_parallel_src_rank(),
group=mpu.get_data_parallel_group())
dp_rank = mpu.get_data_parallel_rank()
pp_rank = mpu.get_pipeline_model_parallel_rank()
pp_size = mpu.get_pipeline_model_parallel_world_size()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
mp_group = mpu.get_model_parallel_group()
if torch.distributed.get_rank() == 0:
assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank()}] != 0 on rank #0"
assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
if not isinstance(wrapped_models, (list, tuple)):
wrapped_models = [wrapped_models]
assert len(wrapped_models) == virtual_pp_size
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers, f'num_layers_per_model: {num_layers_per_model} * pp_size: {pp_size} * virtual_pp_size {virtual_pp_size} != config.num_hidden_layers: {config.num_hidden_layers}'
models = [None] * len(wrapped_models)
for i, wrapped_model in enumerate(wrapped_models):
models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
gpt_model_module = _get_gpt_model(models[i])
assert len(gpt_model_module.model.layers) == num_layers_per_model
def _fetch_tensor(tensor, name) -> torch.Tensor:
"""fetch tensor"""
nonlocal state_dict
if tensor is not None:
tensor.data.copy_(state_dict[name])
def _fetch_tp_shard_tensor_vocab(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
"""fetch tensor in tp shards"""
nonlocal state_dict
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if name in state_dict:
full_weight = state_dict[name]
if mutate_func is not None:
full_weight = mutate_func(full_weight)
tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
if tensor is not None:
tensor.data.copy_(tensor_chunk[tp_rank])
else:
print(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
def _fetch_tp_shard_tensor(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
"""fetch tensor in tp shards"""
nonlocal state_dict
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if name in state_dict:
full_weight = state_dict[name]
if mutate_func is not None:
full_weight = mutate_func(full_weight)
tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
if tensor is not None:
tensor.data.copy_(tensor_chunk[tp_rank])
else:
print(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
def _fetch_tp_shard_tensor_gate_up(tensor, gate_name, up_name) -> torch.Tensor:
"""fetch gate_up tensor in tp shards"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if gate_name in state_dict and up_name in state_dict:
gate_weight = state_dict[gate_name]
up_weight = state_dict[up_name]
new_gate_up_weight = torch.empty(config.intermediate_size * 2,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
intermediate_size_tp = config.intermediate_size // tp_size
gate_weight_tp = gate_weight[i * intermediate_size_tp:(i + 1) * intermediate_size_tp]
up_weight_tp = up_weight[i * intermediate_size_tp:(i + 1) * intermediate_size_tp]
new_gate_up_weight[intermediate_size_tp * 2 * i:intermediate_size_tp * 2 * (i + 1)].copy_(
torch.cat([gate_weight_tp, up_weight_tp], dim=0))
tensor_chunk = torch.chunk(new_gate_up_weight, tp_size, dim=0)
if tensor is not None:
tensor.data.copy_(tensor_chunk[tp_rank])
else:
print(f"tp_shard tensor:[{gate_name}, {up_name}] not in state_dict, skip loading")
def _fetch_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name) -> torch.Tensor:
"""fetch tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
assert (q_name in state_dict and k_name in state_dict and v_name in state_dict)
full_weight_q = state_dict[q_name]
full_weight_k = state_dict[k_name]
full_weight_v = state_dict[v_name]
hidden_size_per_head = config.hidden_size // config.num_attention_heads
if config.num_key_value_heads >= tp_size:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
total_size = q_size_tp + 2 * kv_size_tp
new_weight_qkv = torch.empty(total_size * tp_size,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
q_part = full_weight_q[i * q_size_tp:(i + 1) * q_size_tp]
k_part = full_weight_k[i * kv_size_tp:(i + 1) * kv_size_tp]
v_part = full_weight_v[i * kv_size_tp:(i + 1) * kv_size_tp]
new_weight_qkv[i * total_size:(i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part], dim=0))
else:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head
total_size = q_size_tp + 2 * kv_size_tp
new_weight_qkv = torch.empty(total_size * tp_size,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
q_part = full_weight_q[i * q_size_tp:(i + 1) * q_size_tp]
start_idx = i * config.num_key_value_heads // tp_size * hidden_size_per_head
end_idx = (i * config.num_key_value_heads // tp_size + 1) * hidden_size_per_head
k_part = full_weight_k[start_idx:end_idx]
v_part = full_weight_v[start_idx:end_idx]
new_weight_qkv[i * total_size:(i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part], dim=0))
tensor_chunk = torch.chunk(new_weight_qkv, tp_size, dim=0)
if tensor is not None:
tensor.data.copy_(tensor_chunk[tp_rank])
# Embeddings
# -------------------
print_rank_0("loading embeddings...")
gpt_model_module = _get_gpt_model(models[0])
embed_tokens_weight = None
if pp_rank == 0:
embed_tokens_weight = gpt_model_module.model.embed_tokens.weight
_fetch_tp_shard_tensor_vocab(embed_tokens_weight, "model.embed_tokens.weight")
# Transformer layers
# -------------------
layer_map = _megatron_calc_layer_map(config)
pp_rank = mpu.get_pipeline_model_parallel_rank()
pp_size = mpu.get_pipeline_model_parallel_world_size()
num_layer_per_pp = config.num_hidden_layers // pp_size
vpp_size = mpu.get_virtual_pipeline_model_parallel_world_size()
layer_list = []
if vpp_size is not None:
for vpp_rank in range(vpp_size):
num_layer_vpp_chunk = num_layer_per_pp // vpp_size
num_layer_this_model = num_layer_vpp_chunk
offset = vpp_rank * (
config.num_hidden_layers // mpu.get_virtual_pipeline_model_parallel_world_size()) + \
(mpu.get_pipeline_model_parallel_rank() * num_layer_vpp_chunk)
layer_list.extend(list(range(offset, offset + num_layer_this_model)))
else:
num_layer_this_model = num_layer_per_pp
offset = pp_rank * num_layer_per_pp
layer_list.extend(list(range(offset, offset + num_layer_this_model)))
for layer in layer_list:
print_rank_0(f"loading layer #{layer}...")
layer_name = f"model.layers.{layer}"
dst_pp_rank, dst_virtual_pp_rank, dst_layer_idx = layer_map[layer]
gpt_model_module = _get_gpt_model(models[dst_virtual_pp_rank])
sync_layer = gpt_model_module.model.layers[dst_layer_idx]
_fetch_tensor(
sync_layer.input_layernorm.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.input_layernorm.weight",
)
_fetch_tp_shard_tensor_qkv(
sync_layer.self_attn.qkv_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.self_attn.q_proj.weight",
f"{layer_name}.self_attn.k_proj.weight",
f"{layer_name}.self_attn.v_proj.weight",
)
_fetch_tp_shard_tensor(
sync_layer.self_attn.o_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.self_attn.o_proj.weight",
chunk_dim=1,
)
_fetch_tensor(
sync_layer.post_attention_layernorm.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.post_attention_layernorm.weight",
)
_fetch_tp_shard_tensor_gate_up(sync_layer.mlp.gate_up_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.mlp.gate_proj.weight", f"{layer_name}.mlp.up_proj.weight")
_fetch_tp_shard_tensor(
sync_layer.mlp.down_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.mlp.down_proj.weight",
chunk_dim=1,
)
# Final Layernorm
# -------------------
print_rank_0("loading final layernorm...")
gpt_model_module = _get_gpt_model(models[-1])
_fetch_tensor(
getattr(gpt_model_module.model.norm, "weight", None),
"model.norm.weight",
)
print_rank_0("loading lm_head...")
if pp_rank + 1 == pp_size:
lm_head_weight = gpt_model_module.lm_head.weight
if is_value_model:
if 'lm_head.weight' in state_dict and state_dict['lm_head.weight'].shape[0] == 1:
_fetch_tensor(lm_head_weight, "lm_head.weight")
print_rank_0('load lm_head weight')
elif 'reward_head.weight' in state_dict and state_dict['reward_head.weight'].shape[0] == 1:
_fetch_tensor(lm_head_weight, "reward_head.weight")
print_rank_0('load lm_head from value_head weight')
else:
_fetch_tensor(None, "lm_head.weight")
print_rank_0('fail to match lm_head in value_model')
else:
_fetch_tp_shard_tensor(lm_head_weight, "lm_head.weight")
dist.barrier()
torch.cuda.empty_cache()
print_rank_0(f"loading megatron ckpt done, time elapsed {time.time() - start_time}s")
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
from packaging.version import Version
import torch
import time
from typing import Dict, Any, Callable, Optional
import torch.distributed as dist
def _megatron_calc_layer_map(config):
"""Calculate the mapping of global layer_idx to local layer_idx
Returns:
layer_map (Dict: int -> tuple(int, int, int)):
mapping from the global layer index to
a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
"""
from megatron.core import mpu
print(f'get megatron data parallel size: {mpu.get_data_parallel_world_size()}')
pp_size = mpu.get_pipeline_model_parallel_world_size()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
layer_map = dict()
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
for pp_rank_idx in range(pp_size):
for virtual_pp_rank_idx in range(virtual_pp_size):
layer_offset = (virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) +
pp_rank_idx * num_layers_per_model)
for layer_idx in range(num_layers_per_model):
layer_map[layer_offset + layer_idx] = (
pp_rank_idx,
virtual_pp_rank_idx,
layer_idx,
)
return layer_map
def load_state_dict_to_megatron_llama(state_dict,
wrapped_models,
config,
params_dtype,
is_value_model=False,
tie_word_embeddings=False):
"""Load merged state_dict to sharded Megatron module in training.
"""
from megatron.core import mpu
from verl.utils.megatron_utils import print_rank_0, unwrap_model
from megatron.core.transformer.module import Float16Module
from megatron.core import DistributedDataParallel as LocalDDP
from torch.nn.parallel import DistributedDataParallel as torchDDP
start_time = time.time()
def _get_gpt_model(model):
return model
def broadcast_params(module):
for param in module.parameters():
torch.distributed.broadcast(param.data,
src=mpu.get_data_parallel_src_rank(),
group=mpu.get_data_parallel_group())
dp_rank = mpu.get_data_parallel_rank()
pp_rank = mpu.get_pipeline_model_parallel_rank()
pp_size = mpu.get_pipeline_model_parallel_world_size()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
mp_group = mpu.get_model_parallel_group()
if torch.distributed.get_rank() == 0:
assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank()}] != 0 on rank #0"
assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
if not isinstance(wrapped_models, (list, tuple)):
wrapped_models = [wrapped_models]
assert len(wrapped_models) == virtual_pp_size
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers, f'num_layers_per_model: {num_layers_per_model} * pp_size: {pp_size} * virtual_pp_size {virtual_pp_size} != config.num_hidden_layers: {config.num_hidden_layers}'
models = [None] * len(wrapped_models)
for i, wrapped_model in enumerate(wrapped_models):
models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
gpt_model_module = _get_gpt_model(models[i])
assert len(gpt_model_module.model.layers) == num_layers_per_model
def _broadcast_tensor(tensor, name) -> torch.Tensor:
"""broadcast tensor from rank0 across mp_group"""
nonlocal state_dict
nonlocal mp_group
if torch.distributed.get_rank() == 0:
if name in state_dict:
weight = state_dict[name]
tensor_shape = weight.shape
else:
tensor_shape = None
else:
weight = None
tensor_shape = None
obj_list = [tensor_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
tensor_shape = obj_list[0]
if tensor_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tensor:[{name}] not in state_dict, skip load")
return
if tensor is None:
tensor = torch.empty(
tensor_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
if torch.distributed.get_rank() == 0:
tensor.data.copy_(weight)
dist.broadcast(tensor, src=0, group=mp_group)
def _broadcast_tp_shard_tensor_vocab(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if torch.distributed.get_rank() == 0:
if name in state_dict:
full_weight = state_dict[name]
if mutate_func is not None:
full_weight = mutate_func(full_weight)
tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
chunk_shape = tensor_chunk[0].shape
else:
chunk_shape = None
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
return
if tensor is None:
sync_tensor = torch.empty(
chunk_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
else:
assert (tensor.shape == chunk_shape
), f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
for i in range(tp_size):
if torch.distributed.get_rank() == 0:
sync_tensor.data.copy_(tensor_chunk[i])
dist.broadcast(sync_tensor, src=0, group=mp_group)
if (i == tp_rank) and (tensor is not None):
tensor.data.copy_(sync_tensor)
def _broadcast_tp_shard_tensor(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if torch.distributed.get_rank() == 0:
if name in state_dict:
full_weight = state_dict[name]
if mutate_func is not None:
full_weight = mutate_func(full_weight)
tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
chunk_shape = tensor_chunk[0].shape
else:
chunk_shape = None
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
return
if tensor is None:
sync_tensor = torch.empty(
chunk_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
else:
assert (tensor.shape == chunk_shape
), f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
for i in range(tp_size):
if torch.distributed.get_rank() == 0:
sync_tensor.data.copy_(tensor_chunk[i])
dist.broadcast(sync_tensor, src=0, group=mp_group)
if (i == tp_rank) and (tensor is not None):
tensor.data.copy_(sync_tensor)
def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if torch.distributed.get_rank() == 0:
gate_weight = state_dict[gate_name]
up_weight = state_dict[up_name]
new_gate_up_weight = torch.empty(config.intermediate_size * 2,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
intermediate_size_tp = config.intermediate_size // tp_size
gate_weight_tp = gate_weight[i * intermediate_size_tp:(i + 1) * intermediate_size_tp]
up_weight_tp = up_weight[i * intermediate_size_tp:(i + 1) * intermediate_size_tp]
new_gate_up_weight[intermediate_size_tp * 2 * i:intermediate_size_tp * 2 * (i + 1)].copy_(
torch.cat([gate_weight_tp, up_weight_tp], dim=0))
tensor_chunk = torch.chunk(new_gate_up_weight, tp_size, dim=0)
chunk_shape = tensor_chunk[0].shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] not in state_dict, skip loading")
return
if tensor is None:
sync_tensor = torch.empty(
chunk_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
else:
assert (
tensor.shape == chunk_shape
), f"rank #{torch.distributed.get_rank()} tensor {gate_name, up_name} shape {tensor.shape} != {chunk_shape}"
sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
for i in range(tp_size):
if torch.distributed.get_rank() == 0:
sync_tensor.data.copy_(tensor_chunk[i])
dist.broadcast(sync_tensor, src=0, group=mp_group)
if (i == tp_rank) and (tensor is not None):
tensor.data.copy_(sync_tensor)
def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if torch.distributed.get_rank() == 0:
assert (q_name in state_dict and k_name in state_dict and v_name in state_dict)
full_weight_q = state_dict[q_name]
full_weight_k = state_dict[k_name]
full_weight_v = state_dict[v_name]
hidden_size_per_head = config.hidden_size // config.num_attention_heads
if config.num_key_value_heads >= tp_size:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
total_size = q_size_tp + 2 * kv_size_tp
new_weight_qkv = torch.empty(total_size * tp_size,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
q_part = full_weight_q[i * q_size_tp:(i + 1) * q_size_tp]
k_part = full_weight_k[i * kv_size_tp:(i + 1) * kv_size_tp]
v_part = full_weight_v[i * kv_size_tp:(i + 1) * kv_size_tp]
new_weight_qkv[i * total_size:(i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part],
dim=0))
else:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head
total_size = q_size_tp + 2 * kv_size_tp
new_weight_qkv = torch.empty(total_size * tp_size,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
q_part = full_weight_q[i * q_size_tp:(i + 1) * q_size_tp]
start_idx = i * config.num_key_value_heads // tp_size * hidden_size_per_head
end_idx = (i * config.num_key_value_heads // tp_size + 1) * hidden_size_per_head
k_part = full_weight_k[start_idx:end_idx]
v_part = full_weight_v[start_idx:end_idx]
new_weight_qkv[i * total_size:(i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part],
dim=0))
tensor_chunk = torch.chunk(new_weight_qkv, tp_size, dim=0)
chunk_shape = tensor_chunk[0].shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# either every rank in the mp_group reaches here, or none of them do
print_rank_0(f"tp_shard tensor:[{q_name, k_name, v_name}] not in state_dict, skip loading")
return
if tensor is None:
sync_tensor = torch.empty(
chunk_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
else:
assert (tensor.shape == chunk_shape
), f"rank #{torch.distributed.get_rank()} tensor {q_name} shape {tensor.shape} != {chunk_shape}"
sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
for i in range(tp_size):
if torch.distributed.get_rank() == 0:
sync_tensor.data.copy_(tensor_chunk[i])
dist.broadcast(sync_tensor, src=0, group=mp_group)
if (i == tp_rank) and (tensor is not None):
tensor.data.copy_(sync_tensor)
if dp_rank == 0:
# Embeddings
# -------------------
print_rank_0("loading embeddings...")
gpt_model_module = _get_gpt_model(models[0])
embed_tokens_weight = None
if pp_rank == 0:
embed_tokens_weight = gpt_model_module.model.embed_tokens.weight
_broadcast_tp_shard_tensor_vocab(embed_tokens_weight, "model.embed_tokens.weight")
# Transformer layers
# -------------------
layer_map = _megatron_calc_layer_map(config)
for layer in range(config.num_hidden_layers):
print_rank_0(f"loading layer #{layer}...")
layer_name = f"model.layers.{layer}"
dst_pp_rank, dst_virtual_pp_rank, dst_layer_idx = layer_map[layer]
gpt_model_module = _get_gpt_model(models[dst_virtual_pp_rank])
sync_layer = gpt_model_module.model.layers[dst_layer_idx]
_broadcast_tensor(
sync_layer.input_layernorm.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.input_layernorm.weight",
)
_broadcast_tp_shard_tensor_qkv(
sync_layer.self_attn.qkv_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.self_attn.q_proj.weight",
f"{layer_name}.self_attn.k_proj.weight",
f"{layer_name}.self_attn.v_proj.weight",
)
_broadcast_tp_shard_tensor(
sync_layer.self_attn.o_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.self_attn.o_proj.weight",
chunk_dim=1,
)
_broadcast_tensor(
sync_layer.post_attention_layernorm.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.post_attention_layernorm.weight",
)
_broadcast_tp_shard_tensor_gate_up(sync_layer.mlp.gate_up_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.mlp.gate_proj.weight", f"{layer_name}.mlp.up_proj.weight")
_broadcast_tp_shard_tensor(
sync_layer.mlp.down_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.mlp.down_proj.weight",
chunk_dim=1,
)
# Final Layernorm
# -------------------
print_rank_0("loading final layernorm...")
gpt_model_module = _get_gpt_model(models[-1])
_broadcast_tensor(
getattr(gpt_model_module.model.norm, "weight", None),
"model.norm.weight",
)
print_rank_0("loading lm_head...")
lm_head_weight = None
if pp_rank + 1 == pp_size:
lm_head_weight = gpt_model_module.lm_head.weight
if is_value_model:
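# a value/reward model has a single-output head whose weight may be stored in the HF
# checkpoint as either lm_head.weight or reward_head.weight with shape [1, hidden_size]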
if 'lm_head.weight' in state_dict and state_dict['lm_head.weight'].shape[0] == 1:
_broadcast_tensor(lm_head_weight, "lm_head.weight")
print_rank_0('load lm_head weight')
elif 'reward_head.weight' in state_dict and state_dict['reward_head.weight'].shape[0] == 1:
_broadcast_tensor(lm_head_weight, "reward_head.weight")
print_rank_0('load lm_head from value_head weight')
else:
_broadcast_tensor(None, "lm_head.weight")
print_rank_0('failed to match lm_head or reward_head in value_model')
else:
_broadcast_tp_shard_tensor(lm_head_weight, "lm_head.weight")
dist.barrier()
# Broadcast weights inside data parallel groups
for wrapped_model in wrapped_models:
broadcast_params(wrapped_model)
torch.cuda.empty_cache()
print_rank_0(f"loading megatron ckpt done, time elapsed {time.time() - start_time}s")
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import torch
import torch.distributed as dist
from megatron.core import mpu
from megatron.core.distributed import DistributedDataParallel as LocalDDP
from megatron.core.transformer.module import Float16Module
from torch.nn.parallel import DistributedDataParallel as torchDDP
from verl.utils.megatron_utils import print_rank_0, unwrap_model
def _megatron_calc_global_rank(tp_rank: int = 0, dp_rank: int = 0, pp_rank: int = 0):
"""given TP,DP,PP rank to get the global rank."""
tp_size = mpu.get_tensor_model_parallel_world_size()
dp_size = mpu.get_data_parallel_world_size()
pp_size = mpu.get_pipeline_model_parallel_world_size()
assert (tp_size * dp_size * pp_size == torch.distributed.get_world_size()
), f"{tp_size} x {dp_size} x {pp_size} != {torch.distributed.get_world_size()}"
# We only support TP-DP-PP grouping, for correctness when resharding
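# e.g. tp_size=2, dp_size=2: (tp_rank=1, dp_rank=0, pp_rank=1) -> (1 * 2 + 0) * 2 + 1 = 5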
return (pp_rank * dp_size + dp_rank) * tp_size + tp_rank
def _megatron_calc_layer_map(config):
"""Calculate the mapping of global layer_idx to local layer_idx
Returns:
layer_map (Dict: int -> tuple(int, int, int)):
mapping from the global layer index to
a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
"""
from megatron.core import mpu
pp_size = mpu.get_pipeline_model_parallel_world_size()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
layer_map = dict()
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
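# e.g. num_hidden_layers=8, pp_size=2, virtual_pp_size=2 -> num_layers_per_model=2;
# global layers 6 and 7 map to (pp_rank=1, virtual_pp_rank=1) as local layers 0 and 1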
for pp_rank_idx in range(pp_size):
for virtual_pp_rank_idx in range(virtual_pp_size):
layer_offset = (virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) +
pp_rank_idx * num_layers_per_model)
for layer_idx in range(num_layers_per_model):
layer_map[layer_offset + layer_idx] = (
pp_rank_idx,
virtual_pp_rank_idx,
layer_idx,
)
return layer_map
def merge_megatron_ckpt_llama(wrapped_models, config, dtype, is_value_model=False, tie_word_embeddings=False):
"""Merge sharded parameters of a Megatron module into a merged checkpoint.
Args:
wrapped_models (list of megatron.core.distributed.DistributedDataParallel):
The local DDP wrapped megatron modules.
config (LlamaConfig):
HF model config
dtype: dtype that the merged parameters are cast to
is_value_model: whether the model is a value/reward model
tie_word_embeddings: unused for llama; kept only to match the qwen2 interface
Returns:
state_dict (dict):
The merged state_dict in rank 0, and an empty dictionary in other ranks.
"""
start_time = time.time()
def _get_gpt_model(model):
return model
dp_rank = mpu.get_data_parallel_rank()
pp_size = mpu.get_pipeline_model_parallel_world_size()
pp_rank = mpu.get_pipeline_model_parallel_rank()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
mp_group = mpu.get_model_parallel_group()
if dist.get_rank() == 0:
assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank()}] != 0 on rank #0"
assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
if not isinstance(wrapped_models, (list, tuple)):
wrapped_models = [wrapped_models]
assert len(wrapped_models) == virtual_pp_size
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
models = [None] * len(wrapped_models)
for i, wrapped_model in enumerate(wrapped_models):
models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
assert len(models[i].model.layers
) == num_layers_per_model, 'len model layers {} not equal to num_layers_per_model {}'.format(
len(models[i].model.layers), num_layers_per_model)
state_dict = dict()
def _get_cpu_tensor(tensor: torch.Tensor):
if tensor is None:
return None
if tensor.device == torch.device("cpu"):
return tensor.detach().clone()
return tensor.detach().cpu()
def _broadcast_tensor(tensor, name, src_pp_rank) -> torch.Tensor:
"""broadcast tensor across mp_group"""
nonlocal state_dict
nonlocal mp_group
src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
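# the owning rank (tp_rank 0, dp_rank 0 on src_pp_rank) broadcasts the tensor; rank 0 keeps a CPU copy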
if torch.distributed.get_rank() == src_rank:
if tensor is None:
weight = None
tensor_shape = None
else:
weight = tensor
tensor_shape = weight.shape
else:
weight = None
tensor_shape = None
obj_list = [tensor_shape]
dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
tensor_shape = obj_list[0]
if tensor_shape is None:
# either every rank in the mp_group reaches here, or none of them do
print_rank_0(f"tensor:[{name}] does not exist, skip collecting")
return
if weight is None:
weight = torch.empty(
tensor_shape,
dtype=dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
dist.broadcast(weight, src=src_rank, group=mp_group)
if torch.distributed.get_rank() == 0:
state_dict[name] = _get_cpu_tensor(weight)
def _broadcast_tp_shard_tensor(tensor, name, src_pp_rank, concat_dim=0, mutate_func=None) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
if torch.distributed.get_rank() == src_rank:
chunk_shape = tensor.shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# either every rank in the mp_group reaches here, or none of them do
print_rank_0(f"tp_shard tensor:[{name}] does not exist, skip collecting")
return
buffer_tensor = torch.empty(
chunk_shape,
dtype=dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
chunk_tensors = [None] * tp_size
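# each TP rank broadcasts its local shard in turn; rank 0 collects all shards on CPU
# and concatenates them along concat_dim below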
for i in range(tp_size):
cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
if torch.distributed.get_rank() == 0:
chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
if torch.distributed.get_rank() == 0:
full_tensor = torch.concat(chunk_tensors, dim=concat_dim)
if mutate_func is not None:
full_tensor = mutate_func(full_tensor)
state_dict[name] = full_tensor
def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name, src_pp_rank) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
if torch.distributed.get_rank() == src_rank:
chunk_shape = tensor.shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# either every rank in the mp_group reaches here, or none of them do
print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] does not exist, skip collecting")
return
buffer_tensor = torch.empty(
chunk_shape,
dtype=dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
chunk_tensors = [None] * tp_size
for i in range(tp_size):
cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
if torch.distributed.get_rank() == 0:
chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
if torch.distributed.get_rank() == 0:
full_tensor = torch.concat(chunk_tensors, dim=0)
intermediate_size_tp = config.intermediate_size // tp_size
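# undo the per-shard [gate_i; up_i] interleaving: split every block of
# 2 * intermediate_size_tp rows back into its gate and up halves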
gate_weight_list = []
up_weight_list = []
for i in range(tp_size):
gate_up_weight_tp = full_tensor[intermediate_size_tp * 2 * i:intermediate_size_tp * 2 * (i + 1)]
gate_weight_tp = gate_up_weight_tp[:intermediate_size_tp]
up_weight_tp = gate_up_weight_tp[intermediate_size_tp:]
gate_weight_list.append(gate_weight_tp)
up_weight_list.append(up_weight_tp)
state_dict[gate_name] = torch.cat(gate_weight_list, dim=0)
state_dict[up_name] = torch.cat(up_weight_list, dim=0)
def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, src_pp_rank):
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
if torch.distributed.get_rank() == src_rank:
chunk_shape = tensor.shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# either every rank in the mp_group reaches here, or none of them do
print_rank_0(f"tp_shard tensor:[{q_name, k_name, v_name}] does not exist, skip collecting")
return
buffer_tensor = torch.empty(
chunk_shape,
dtype=dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
chunk_tensors = [None] * tp_size
for i in range(tp_size):
cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
if torch.distributed.get_rank() == 0:
chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
if torch.distributed.get_rank() == 0:
full_tensor = torch.concat(chunk_tensors, dim=0)
q_weight_list = []
k_weight_list = []
v_weight_list = []
hidden_size_per_head = config.hidden_size // config.num_attention_heads
if config.num_key_value_heads >= tp_size:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
total_size = q_size_tp + 2 * kv_size_tp
for i in range(tp_size):
qkv_part = full_tensor[i * total_size:(i + 1) * total_size]
q_part = qkv_part[:q_size_tp]
k_part = qkv_part[q_size_tp:q_size_tp + kv_size_tp]
v_part = qkv_part[q_size_tp + kv_size_tp:total_size]
q_weight_list.append(q_part)
k_weight_list.append(k_part)
v_weight_list.append(v_part)
else:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head
total_size = q_size_tp + 2 * kv_size_tp
for i in range(tp_size):
qkv_part = full_tensor[i * total_size:(i + 1) * total_size]
q_part = qkv_part[:q_size_tp]
k_part = qkv_part[q_size_tp:q_size_tp + kv_size_tp]
v_part = qkv_part[q_size_tp + kv_size_tp:total_size]
q_weight_list.append(q_part)
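# when num_key_value_heads < tp_size, each KV head was replicated across
# tp_size // num_key_value_heads consecutive TP ranks; keep only the first copy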
if i * config.num_key_value_heads % tp_size == 0:
k_weight_list.append(k_part)
v_weight_list.append(v_part)
state_dict[q_name] = torch.cat(q_weight_list, dim=0)
state_dict[k_name] = torch.cat(k_weight_list, dim=0)
state_dict[v_name] = torch.cat(v_weight_list, dim=0)
# empty cache before collecting weights
torch.cuda.empty_cache()
if dp_rank == 0:
# Embeddings
# -------------------
print_rank_0("collecting embeddings...")
gpt_model_module = _get_gpt_model(models[0])
_broadcast_tp_shard_tensor(
gpt_model_module.model.embed_tokens.weight if pp_rank == 0 else None,
"model.embed_tokens.weight",
src_pp_rank=0,
)
# Transformer layers
# -------------------
layer_map = _megatron_calc_layer_map(config)
for layer in range(config.num_hidden_layers):
print_rank_0(f"collecting layer #{layer}...")
layer_name = f"model.layers.{layer}"
src_pp_rank, src_virtual_pp_rank, src_layer_idx = layer_map[layer]
gpt_model_module = _get_gpt_model(models[src_virtual_pp_rank])
sync_layer = gpt_model_module.model.layers[src_layer_idx]
_broadcast_tensor(
sync_layer.input_layernorm.weight,
f"{layer_name}.input_layernorm.weight",
src_pp_rank=src_pp_rank,
)
_broadcast_tp_shard_tensor_qkv(
sync_layer.self_attn.qkv_proj.weight,
f"{layer_name}.self_attn.q_proj.weight",
f"{layer_name}.self_attn.k_proj.weight",
f"{layer_name}.self_attn.v_proj.weight",
src_pp_rank=src_pp_rank,
)
_broadcast_tp_shard_tensor(
sync_layer.self_attn.o_proj.weight,
f"{layer_name}.self_attn.o_proj.weight",
concat_dim=1,
src_pp_rank=src_pp_rank,
)
_broadcast_tensor(
sync_layer.post_attention_layernorm.weight,
f"{layer_name}.post_attention_layernorm.weight",
src_pp_rank=src_pp_rank,
)
_broadcast_tp_shard_tensor_gate_up(sync_layer.mlp.gate_up_proj.weight,
f"{layer_name}.mlp.gate_proj.weight",
f"{layer_name}.mlp.up_proj.weight",
src_pp_rank=src_pp_rank)
_broadcast_tp_shard_tensor(
sync_layer.mlp.down_proj.weight,
f"{layer_name}.mlp.down_proj.weight",
concat_dim=1,
src_pp_rank=src_pp_rank,
)
# Final Layernorm
# -------------------
print_rank_0("collecting final layernorm...")
gpt_model_module = _get_gpt_model(models[-1])
_broadcast_tensor(
getattr(gpt_model_module.model.norm, "weight", None),
"model.norm.weight",
src_pp_rank=pp_size - 1,
)
print_rank_0("collecting lm_head...")
if is_value_model:
if pp_rank == pp_size - 1:
print(f'gpt_model_module.lm_head.weight: {gpt_model_module.lm_head.weight.shape}')
_broadcast_tensor(gpt_model_module.lm_head.weight if pp_rank == pp_size - 1 else None,
"lm_head.weight",
src_pp_rank=pp_size - 1)
_broadcast_tensor(gpt_model_module.reward_head.weight if pp_rank == pp_size - 1 and
getattr(gpt_model_module, "reward_weight", None) is not None else None,
"reward_head.weight",
src_pp_rank=pp_size - 1)
else:
_broadcast_tp_shard_tensor(
getattr(gpt_model_module.lm_head, "weight", None) if pp_rank == pp_size - 1 else None,
"lm_head.weight",
src_pp_rank=pp_size - 1,
)
dist.barrier()
torch.cuda.empty_cache()
if torch.distributed.get_rank() == 0:
if dtype not in [torch.float16, torch.bfloat16, torch.float32]:
print(f'Unknown/unsupported dtype to save: {dtype}')
exit(1)
for k, v in state_dict.items():
if dtype != v.dtype:
state_dict[k] = v.to(dtype)
print_rank_0(f"merge megatron ckpt done, time elapsed {time.time() - start_time}s")
return state_dict
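# Usage sketch (names other than the function's own parameters are illustrative):
# state_dict = merge_megatron_ckpt_llama(wrapped_models, hf_config, dtype=torch.bfloat16)
# Only rank 0 receives the populated HF-style state_dict; all other ranks get an empty dict.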
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .parallel_attention import ParallelLlamaAttention
from .parallel_decoder import ParallelLlamaDecoderLayer, ParallelLlamaDecoderLayerRmPad
from .parallel_mlp import ParallelLlamaMLP
from .parallel_rmsnorm import ParallelLlamaRMSNorm
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Optional, Tuple
import torch
from megatron.core import parallel_state as mpu
from megatron.core import tensor_parallel
from megatron.core import ModelParallelConfig
from torch import nn
from transformers import LlamaConfig
from verl.models.llama.megatron.layers.parallel_linear import QKVParallelLinear
from verl.utils.megatron import tensor_parallel as tp_utils
class LlamaRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base**(torch.arange(0, self.dim, 2).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(seq_len=max_position_embeddings,
device=self.inv_freq.device,
dtype=torch.get_default_dtype())
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def forward(self, x, seq_len=None):
# x: [bs, num_attention_heads, seq_len, head_size]
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
"""LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
t = t / self.scaling_factor
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
"""LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
if seq_len > self.max_position_embeddings:
base = self.base * ((self.scaling_factor * seq_len / self.max_position_embeddings) -
(self.scaling_factor - 1))**(self.dim / (self.dim - 2))
inv_freq = 1.0 / (base**(torch.arange(0, self.dim, 2).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
class LlamaLlama3ScalingRotaryEmbedding(LlamaRotaryEmbedding):
def __init__(self, dim, config, max_position_embeddings=2048, base=10000, device=None):
super().__init__(dim, max_position_embeddings, base, device)
self.factor = config.rope_scaling["factor"] # `8` in the original implementation
self.high_freq_factor = config.rope_scaling["high_freq_factor"] # `1` in the original implementation
self.low_freq_factor = config.rope_scaling["low_freq_factor"] # `4` in the original implementation
self.old_context_len = config.rope_scaling[
"original_max_position_embeddings"] # `8192` in the original implementation
low_freq_wavelen = self.old_context_len / self.low_freq_factor
high_freq_wavelen = self.old_context_len / self.high_freq_factor
wavelen = 2 * math.pi / self.inv_freq
# wavelen < high_freq_wavelen: do nothing; wavelen > low_freq_wavelen: divide by factor
inv_freq_llama = torch.where(wavelen > low_freq_wavelen, self.inv_freq / self.factor, self.inv_freq)
# otherwise: interpolate between the two, using a smooth factor
smooth_factor = (self.old_context_len / wavelen - self.low_freq_factor) / (self.high_freq_factor -
self.low_freq_factor)
smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / self.factor + smooth_factor * inv_freq_llama
is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
inv_freq = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
self.register_buffer("inv_freq", inv_freq, persistent=False)
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(seq_len=max_position_embeddings,
device=self.inv_freq.device,
dtype=torch.get_default_dtype())
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., :x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
"""
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
class ParallelLlamaAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
super().__init__()
self.config = config
self.megatron_config = megatron_config
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
# assign values after tp
tp_size = mpu.get_tensor_model_parallel_world_size()
assert self.num_heads % tp_size == 0, f'num_head must be divisible by tp_size. Got num_head={self.num_heads}, tp_size={tp_size}'
assert self.num_key_value_heads % tp_size == 0, \
f'num_key_value_heads must be divisible by tp_size. Got num_key_value_heads={self.num_key_value_heads}, tp_size={tp_size}'
self.num_heads_per_tp = self.num_heads // tp_size
self.num_key_value_heads_per_tp = self.num_key_value_heads // tp_size
self.hidden_size_per_tp = self.hidden_size // tp_size
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads}).")
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
row_kwargs = tp_utils.get_default_kwargs_for_row_parallel_linear()
if megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
assert row_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(column_kwargs, megatron_config)
tp_utils.update_kwargs_with_config(row_kwargs, megatron_config)
# [self.q_size, self.k_size, self.v_size]
self.qkv_proj = QKVParallelLinear(input_size=self.hidden_size,
num_heads=self.num_heads,
num_key_value_heads=self.num_key_value_heads,
head_dim=self.head_dim,
bias=config.attention_bias,
gather_output=False,
skip_bias_add=False,
**column_kwargs)
self.q_size = self.num_heads_per_tp * self.head_dim
self.k_size = self.num_key_value_heads_per_tp * self.head_dim
self.v_size = self.num_key_value_heads_per_tp * self.head_dim
self.o_proj = tensor_parallel.RowParallelLinear(input_size=self.num_heads * self.head_dim,
output_size=self.hidden_size,
bias=config.attention_bias,
input_is_parallel=True,
skip_bias_add=False,
**row_kwargs)
self._init_rope()
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = LlamaRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
else:
rope_type_key = "type" if "type" in self.config.rope_scaling else "rope_type"
scaling_type = self.config.rope_scaling[rope_type_key]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
elif scaling_type == "dynamic":
self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
elif scaling_type == "llama3":
self.rotary_emb = LlamaLlama3ScalingRotaryEmbedding(
self.head_dim,
self.config,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
qkv = self.qkv_proj(hidden_states)[0]
query_states, key_states, value_states = qkv.split([self.q_size, self.k_size, self.v_size], dim=-1)
query_states = query_states.view(bsz, q_len, self.num_heads_per_tp, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads_per_tp, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads_per_tp, self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
if attn_weights.size() != (bsz, self.num_heads_per_tp, q_len, kv_seq_len):
raise ValueError(
f"Attention weights should be of size {(bsz, self.num_heads_per_tp, q_len, kv_seq_len)}, but is"
f" {attn_weights.size()}")
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}")
attn_weights = attn_weights + attention_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads_per_tp, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads_per_tp, q_len, self.head_dim)}, but is"
f" {attn_output.size()}")
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size_per_tp)
attn_output = self.o_proj(attn_output)[0]
return attn_output
"""
Remove padding Attention
- Using Flash-attn 2
- Compatible with sequence parallel
"""
from transformers.utils import is_flash_attn_2_available
import torch.nn.functional as F
from einops import rearrange
if is_flash_attn_2_available():
from flash_attn import flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
def apply_rotary_pos_emb_rmpad(q, k, cos, sin, position_ids, indices, sequence_length):
batch_size = position_ids.shape[0]
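# temporarily pad back to (batch, seqlen, ...) so cos/sin can be indexed with position_ids,
# then re-remove the padding with index_first_axis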
q = pad_input(q, indices, batch_size, sequence_length) # (batch_size, seqlen, num_head, head_dim)
k = pad_input(k, indices, batch_size, sequence_length)
cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim]
sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim]
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
q_embed = index_first_axis(rearrange(q_embed, "b s ... -> (b s) ..."), indices)
k_embed = index_first_axis(rearrange(k_embed, "b s ... -> (b s) ..."), indices)
return q_embed, k_embed
from flash_attn.layers.rotary import apply_rotary_emb
# use flash-attn rotary embeddings with rmpad
# cos/sin should be: (seq_length, rotary_dim / 2)
def apply_rotary_pos_emb_rmpad_flash(q, k, cos, sin, cu_seqlens, max_seqlen):
q_embed = apply_rotary_emb(q,
cos,
sin,
interleaved=False,
inplace=False,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen)
k_embed = apply_rotary_emb(k,
cos,
sin,
interleaved=False,
inplace=False,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen)
return q_embed, k_embed
class ParallelLlamaAttentionRmPad(ParallelLlamaAttention):
def forward(self,
hidden_states: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
sequence_length: int = None,
indices: torch.Tensor = None,
cu_seqlens: torch.Tensor = None,
max_seqlen_in_batch: int = None):
total_nnz, _, _ = hidden_states.size() # This is the total_nnz padded after sequence parallel
if self.megatron_config.sequence_parallel:
total_nnz = total_nnz * mpu.get_tensor_model_parallel_world_size()
qkv = self.qkv_proj(hidden_states)[0]
query_states, key_states, value_states = qkv.split([self.q_size, self.k_size, self.v_size],
dim=-1) # (total_nnz, 1, hidden_size)
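# with sequence parallelism the packed sequence was padded so it can be split across TP ranks;
# cu_seqlens[-1] is the real token count, so the padding is stripped here and re-added after attention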
if self.megatron_config.sequence_parallel:
sequence_parallel_pad = total_nnz - cu_seqlens[-1]
total_nnz = cu_seqlens[-1] # total_nnz before sp padding
query_states = query_states[:total_nnz]
key_states = key_states[:total_nnz]
value_states = value_states[:total_nnz]
# flash_attn_varlen_func expects unpadded inputs of shape
# (total_nnz, num_heads, head_dim),
# so reshape without transposing the head dimension
query_states = query_states.view(total_nnz, self.num_heads_per_tp, self.head_dim)
key_states = key_states.view(total_nnz, self.num_key_value_heads_per_tp, self.head_dim)
value_states = value_states.view(total_nnz, self.num_key_value_heads_per_tp, self.head_dim)
cos, sin = self.rotary_emb(value_states, seq_len=sequence_length)
cos, sin = cos[:, :cos.shape[1] // 2], sin[:, :sin.shape[1] // 2] # flash attn only needs half
query_states, key_states = apply_rotary_pos_emb_rmpad_flash(query_states,
key_states,
cos,
sin,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen_in_batch)
# query_states, key_states = apply_rotary_pos_emb_rmpad(query_states, key_states, cos, sin, position_ids, indices,
# TODO: llama does not have dropout in the config??
# It is recommended to use dropout with FA according to the docs
# when training.
dropout_rate = 0.0 # if not self.training else self.attn_dropout
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
# therefore the input hidden states gets silently casted in float32. Hence, we need
# cast them back in float16 just to be sure everything works as expected.
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
# in fp32. (LlamaRMSNorm handles it correctly)
input_dtype = query_states.dtype
if input_dtype == torch.float32:
query_states = query_states.to(torch.float16)
key_states = key_states.to(torch.float16)
value_states = value_states.to(torch.float16)
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens,
cu_seqlens_k=cu_seqlens,
max_seqlen_q=max_seqlen_in_batch,
max_seqlen_k=max_seqlen_in_batch,
dropout_p=dropout_rate,
softmax_scale=None,
causal=True,
)
attn_output_unpad = attn_output_unpad.to(input_dtype)
attn_output_unpad = attn_output_unpad.reshape(total_nnz, 1, self.hidden_size_per_tp).contiguous()
# sequence parallel reduce_scatter is performed inside RowColumnParallel if enabled
# Here we need to repad
if self.megatron_config.sequence_parallel:
attn_output_unpad = F.pad(attn_output_unpad, pad=(0, 0, 0, 0, 0, sequence_parallel_pad))
attn_output_unpad = self.o_proj(attn_output_unpad)[0]
return attn_output_unpad
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Tuple
import torch
from torch import nn
from transformers import LlamaConfig
from megatron.core import ModelParallelConfig
from .parallel_attention import ParallelLlamaAttention, ParallelLlamaAttentionRmPad
from .parallel_mlp import ParallelLlamaMLP
from .parallel_rmsnorm import ParallelLlamaRMSNorm
from verl.utils.megatron_utils import TransformerConfig, convert_config
class ParallelLlamaDecoderLayer(nn.Module):
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig, layer_idx: int):
super().__init__()
self.config: TransformerConfig = convert_config(config, megatron_config)
self.layer_idx = layer_idx
self.hidden_size = config.hidden_size
self.self_attn = ParallelLlamaAttention(config=config, megatron_config=megatron_config)
self.mlp = ParallelLlamaMLP(config, megatron_config=megatron_config)
self.input_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
self.post_attention_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
"""
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Note: sequence parallel is hidden inside ColumnParallelLinear
# reduce scatter is hidden inside RowParallelLinear
# Self Attention
hidden_states = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
)
# TODO: add sequence parallel operator reduce_scatter here
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
# TODO: add sequence parallel operator all_gather here
hidden_states = self.mlp(hidden_states)
# TODO: add sequence parallel operator reduce_scatter here
hidden_states = residual + hidden_states
outputs = hidden_states
return outputs
class ParallelLlamaDecoderLayerRmPad(nn.Module):
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig, layer_idx: int):
super().__init__()
self.config: TransformerConfig = convert_config(config, megatron_config)
self.layer_idx = layer_idx
self.hidden_size = config.hidden_size
self.self_attn = ParallelLlamaAttentionRmPad(config=config, megatron_config=megatron_config)
self.mlp = ParallelLlamaMLP(config, megatron_config=megatron_config)
self.input_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
self.post_attention_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
def forward(
self,
hidden_states: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
sequence_length: int = None,
indices: torch.Tensor = None,
cu_seqlens: torch.Tensor = None,
max_seqlen_in_batch: int = None
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states # (total_nnz // sp, 1, hidden_size)
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
# (total_nnz // sp, 1, hidden_size) -> all-gather (total_nnz, 1, hidden_size)
# -> col + row -> reduce-scatter -> (total_nnz // sp, 1, hidden_size)
hidden_states = self.self_attn(hidden_states=hidden_states,
position_ids=position_ids,
sequence_length=sequence_length,
indices=indices,
cu_seqlens=cu_seqlens,
max_seqlen_in_batch=max_seqlen_in_batch)
hidden_states = residual + hidden_states
# Fully Connected
# shape changes same as attn
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = hidden_states
return outputs
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/linear.py
from typing import Optional, Tuple
from megatron.core import tensor_parallel
class QKVParallelLinear(tensor_parallel.ColumnParallelLinear):
def __init__(self,
input_size,
num_heads,
num_key_value_heads,
head_dim,
*,
bias=True,
gather_output=True,
skip_bias_add=False,
**kwargs):
# Keep the input parameters; the head counts here are the full (unsharded) values
self.input_size = input_size
self.q_output_size = num_heads * head_dim
self.kv_output_size = num_key_value_heads * head_dim
self.head_dim = head_dim
self.gather_output = gather_output
self.skip_bias_add = skip_bias_add
input_size = self.input_size
output_size = (num_heads + 2 * num_key_value_heads) * self.head_dim
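# Q, K and V are fused into a single column-parallel linear; the output dim is sharded across
# TP ranks, and the checkpoint loader packs the weight so each rank's slice is [q_i; k_i; v_i],
# which ParallelLlamaAttention.forward splits back with [q_size, k_size, v_size]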
super().__init__(input_size=input_size,
output_size=output_size,
bias=bias,
gather_output=gather_output,
skip_bias_add=skip_bias_add,
**kwargs)
class MergedColumnParallelLinear(tensor_parallel.ColumnParallelLinear):
def __init__(self,
input_size,
gate_ouput_size,
up_output_size,
*,
bias=True,
gather_output=True,
skip_bias_add=False,
**kwargs):
# Keep the input parameters; the output sizes here are the full (unsharded) values
self.input_size = input_size
self.output_size = gate_ouput_size + up_output_size
self.gather_output = gather_output
self.skip_bias_add = skip_bias_add
super().__init__(input_size=self.input_size,
output_size=self.output_size,
bias=bias,
gather_output=gather_output,
skip_bias_add=skip_bias_add,
**kwargs)
import torch
class LinearForLastLayer(torch.nn.Linear):
def __init__(
self,
input_size,
output_size,
*,
config,
bias=True,
):
super().__init__(in_features=input_size, out_features=output_size, bias=bias)
self.sequence_parallel = config.sequence_parallel
if self.sequence_parallel:
setattr(self.weight, 'sequence_parallel', True)
def forward(
self,
input_,
weight=None,
runtime_gather_output=None,
):
logits = super().forward(input_)
logits = logits.float()
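# upcast to fp32 for numerical stability; with sequence parallelism each rank holds only a
# slice of the sequence, so the logits are gathered back to the full sequence before returning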
if self.sequence_parallel:
logits = tensor_parallel.gather_from_sequence_parallel_region(logits, tensor_parallel_output_grad=False)
return logits, None