# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import pytest
import torch
from tensordict import TensorDict
from verl.protocol import union_tensor_dict, union_numpy_dict
from verl import DataProto
import numpy as np
def test_union_tensor_dict():
obs = torch.randn(100, 10)
data1 = TensorDict({'obs': obs, 'act': torch.randn(100, 3)}, batch_size=[100])
data2 = TensorDict({'obs': obs, 'next_obs': torch.randn(100, 10), 'rew': torch.randn(100)}, batch_size=[100])
data_with_copied_obs = TensorDict({
'obs': obs.clone(),
'next_obs': torch.randn(100, 10),
'rew': torch.randn(100)
},
batch_size=[100])
data = union_tensor_dict(data1, data2)
with pytest.raises(AssertionError):
data = union_tensor_dict(data1, data_with_copied_obs)
data = np.random.random(100)
data2 = [float('nan') for _ in range(99)]
data2.append('nan')
data2 = np.array(data2, dtype=object)
data3 = np.tile(data2, (2, 1))
a = {'a': data, 'b': data2, 'c': data3}
b = {'a': data, 'b': data2, 'c': data3}
b_ = {'a': np.random.random(100)}
union_numpy_dict(a, b)
with pytest.raises(AssertionError):
union_numpy_dict(a, b_)
def test_tensor_dict_constructor():
obs = torch.randn(100, 10)
act = torch.randn(100, 10, 3)
data = DataProto.from_dict(tensors={'obs': obs, 'act': act})
assert data.batch.batch_size == torch.Size([100])
with pytest.raises(AssertionError):
data = DataProto.from_dict(tensors={'obs': obs, 'act': act}, num_batch_dims=2)
with pytest.raises(AssertionError):
data = DataProto.from_dict(tensors={'obs': obs, 'act': act}, num_batch_dims=3)
def test_tensor_dict_make_iterator():
obs = torch.randn(100, 10)
labels = [random.choice(['abc', 'cde']) for _ in range(100)]
dataset = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels})
data_iter_1 = dataset.make_iterator(mini_batch_size=10, epochs=2, seed=1)
data_list_1 = []
for data in data_iter_1:
data_list_1.append(data)
data_iter_2 = dataset.make_iterator(mini_batch_size=10, epochs=2, seed=1)
data_list_2 = []
for data in data_iter_2:
data_list_2.append(data)
for data1, data2 in zip(data_list_1, data_list_2):
assert isinstance(data1, DataProto)
assert isinstance(data2, DataProto)
result = torch.all(torch.eq(data1.batch['obs'], data2.batch['obs']))
if not result.item():
print(data1.batch['obs'])
print(data2.batch['obs'])
assert False
non_tensor_result = np.all(np.equal(data1.non_tensor_batch['labels'], data2.non_tensor_batch['labels']))
if not non_tensor_result.item():
print(data1.non_tensor_batch['labels'])
print(data2.non_tensor_batch['labels'])
assert False
def test_reorder():
obs = torch.tensor([1, 2, 3, 4, 5, 6])
labels = ['a', 'b', 'c', 'd', 'e', 'f']
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'name': 'abdce'})
data.reorder(torch.tensor([3, 4, 2, 0, 1, 5]))
assert torch.all(torch.eq(data.batch['obs'], torch.tensor([4, 5, 3, 1, 2, 6])))
assert np.all(data.non_tensor_batch['labels'] == np.array(['d', 'e', 'c', 'a', 'b', 'f']))
assert data.meta_info == {'name': 'abdce'}
def test_chunk_concat():
obs = torch.tensor([1, 2, 3, 4, 5, 6])
labels = ['a', 'b', 'c', 'd', 'e', 'f']
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'name': 'abdce'})
with pytest.raises(AssertionError):
data.chunk(5)
data_split = data.chunk(2)
assert len(data_split) == 2
assert torch.all(torch.eq(data_split[0].batch['obs'], torch.tensor([1, 2, 3])))
assert np.all(data_split[0].non_tensor_batch['labels'] == np.array(['a', 'b', 'c']))
assert data_split[0].meta_info == {'name': 'abdce'}
assert torch.all(torch.eq(data_split[1].batch['obs'], torch.tensor([4, 5, 6])))
assert np.all(data_split[1].non_tensor_batch['labels'] == np.array(['d', 'e', 'f']))
assert data_split[1].meta_info == {'name': 'abdce'}
concat_data = DataProto.concat(data_split)
assert torch.all(torch.eq(concat_data.batch['obs'], data.batch['obs']))
assert np.all(concat_data.non_tensor_batch['labels'] == data.non_tensor_batch['labels'])
assert concat_data.meta_info == data.meta_info
def test_pop():
obs = torch.randn(100, 10)
act = torch.randn(100, 3)
dataset = DataProto.from_dict({'obs': obs, 'act': act}, meta_info={'2': 2, '1': 1})
popped_dataset = dataset.pop(batch_keys=['obs'], meta_info_keys=['2'])
assert popped_dataset.batch.keys() == {'obs'}
assert popped_dataset.meta_info.keys() == {'2'}
assert dataset.batch.keys() == {'act'}
assert dataset.meta_info.keys() == {'1'}
def test_repeat():
# Create a DataProto object with some batch and non-tensor data
obs = torch.tensor([[1, 2], [3, 4], [5, 6]])
labels = ['a', 'b', 'c']
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'info': 'test_info'})
# Test interleave=True
repeated_data_interleave = data.repeat(repeat_times=2, interleave=True)
expected_obs_interleave = torch.tensor([[1, 2], [1, 2], [3, 4], [3, 4], [5, 6], [5, 6]])
expected_labels_interleave = ['a', 'a', 'b', 'b', 'c', 'c']
assert torch.all(torch.eq(repeated_data_interleave.batch['obs'], expected_obs_interleave))
assert (repeated_data_interleave.non_tensor_batch['labels'] == expected_labels_interleave).all()
assert repeated_data_interleave.meta_info == {'info': 'test_info'}
# Test interleave=False
repeated_data_no_interleave = data.repeat(repeat_times=2, interleave=False)
expected_obs_no_interleave = torch.tensor([[1, 2], [3, 4], [5, 6], [1, 2], [3, 4], [5, 6]])
expected_labels_no_interleave = ['a', 'b', 'c', 'a', 'b', 'c']
assert torch.all(torch.eq(repeated_data_no_interleave.batch['obs'], expected_obs_no_interleave))
assert (repeated_data_no_interleave.non_tensor_batch['labels'] == expected_labels_no_interleave).all()
assert repeated_data_no_interleave.meta_info == {'info': 'test_info'}
def test_dataproto_pad_unpad():
obs = torch.tensor([[1, 2], [3, 4], [5, 6]])
labels = ['a', 'b', 'c']
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'info': 'test_info'})
from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
padded_data, pad_size = pad_dataproto_to_divisor(data, size_divisor=2)
assert pad_size == 1
expected_obs = torch.tensor([[1, 2], [3, 4], [5, 6], [1, 2]])
expected_labels = ['a', 'b', 'c', 'a']
assert torch.all(torch.eq(padded_data.batch['obs'], expected_obs))
assert (padded_data.non_tensor_batch['labels'] == expected_labels).all()
assert padded_data.meta_info == {'info': 'test_info'}
unpadded_data = unpad_dataproto(padded_data, pad_size=pad_size)
assert torch.all(torch.eq(unpadded_data.batch['obs'], obs))
assert (unpadded_data.non_tensor_batch['labels'] == labels).all()
assert unpadded_data.meta_info == {'info': 'test_info'}
padded_data, pad_size = pad_dataproto_to_divisor(data, size_divisor=3)
assert pad_size == 0
expected_obs = torch.tensor([[1, 2], [3, 4], [5, 6]])
expected_labels = ['a', 'b', 'c']
assert torch.all(torch.eq(padded_data.batch['obs'], expected_obs))
assert (padded_data.non_tensor_batch['labels'] == expected_labels).all()
assert padded_data.meta_info == {'info': 'test_info'}
unpadded_data = unpad_dataproto(padded_data, pad_size=pad_size)
assert torch.all(torch.eq(unpadded_data.batch['obs'], obs))
assert (unpadded_data.non_tensor_batch['labels'] == labels).all()
assert unpadded_data.meta_info == {'info': 'test_info'}
padded_data, pad_size = pad_dataproto_to_divisor(data, size_divisor=7)
assert pad_size == 4
expected_obs = torch.tensor([[1, 2], [3, 4], [5, 6], [1, 2], [3, 4], [5, 6], [1, 2]])
expected_labels = ['a', 'b', 'c', 'a', 'b', 'c', 'a']
assert torch.all(torch.eq(padded_data.batch['obs'], expected_obs))
assert (padded_data.non_tensor_batch['labels'] == expected_labels).all()
assert padded_data.meta_info == {'info': 'test_info'}
unpadded_data = unpad_dataproto(padded_data, pad_size=pad_size)
assert torch.all(torch.eq(unpadded_data.batch['obs'], obs))
assert (unpadded_data.non_tensor_batch['labels'] == labels).all()
assert unpadded_data.meta_info == {'info': 'test_info'}
def test_dataproto_fold_unfold():
from verl.protocol import fold_batch_dim, unfold_batch_dim, DataProto
obs = torch.tensor([[1, 2], [3, 4], [5, 6]])
labels = ['a', 'b', 'c']
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'info': 'test_info'})
data1 = data.repeat(repeat_times=2, interleave=True)
data2 = fold_batch_dim(data1, new_batch_size=3)
torch.testing.assert_close(data2.batch['obs'], torch.tensor([[[1, 2], [1, 2]], [[3, 4], [3, 4]], [[5, 6], [5, 6]]]))
assert (data2.non_tensor_batch['labels'] == [['a', 'a'], ['b', 'b'], ['c', 'c']]).all()
data2.reorder(indices=torch.tensor([1, 2, 0]))
data3 = unfold_batch_dim(data2, batch_dims=2)
torch.testing.assert_close(data3.batch['obs'], torch.tensor([[3, 4], [3, 4], [5, 6], [5, 6], [1, 2], [1, 2]]))
assert (data3.non_tensor_batch['labels'] == ['b', 'b', 'c', 'c', 'a', 'a']).all()
assert data3.meta_info == {'info': 'test_info'}
def test_torch_save_data_proto():
obs = torch.tensor([[1, 2], [3, 4], [5, 6]])
labels = ['a', 'b', 'c']
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'info': 'test_info'})
data.save_to_disk('test_data.pt')
loaded_data = DataProto.load_from_disk('test_data.pt')
assert torch.all(torch.eq(loaded_data.batch['obs'], data.batch['obs']))
assert (loaded_data.non_tensor_batch['labels'] == data.non_tensor_batch['labels']).all()
assert loaded_data.meta_info == data.meta_info
import os
os.remove('test_data.pt')
def test_len():
obs = torch.tensor([[1, 2], [3, 4], [5, 6]])
labels = np.array(['a', 'b', 'c'], dtype=object)
data = DataProto.from_dict(tensors={'obs': obs}, non_tensors={'labels': labels}, meta_info={'info': 'test_info'})
assert len(data) == 3
data = DataProto(batch=None, non_tensor_batch={'labels': labels}, meta_info={'info': 'test_info'})
assert len(data) == 3
data = DataProto(batch=None, non_tensor_batch={}, meta_info={'info': 'test_info'})
assert len(data) == 0
data = DataProto(batch=None, non_tensor_batch=None, meta_info={'info': 'test_info'})
assert len(data) == 0
def test_seqlen_balancing():
from verl.utils.seqlen_balancing import rearrange_micro_batches, get_reverse_idx
input_ids = torch.randint(low=0, high=10, size=(20, 100))
from verl.utils.model import create_random_mask
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0.1,
max_ratio_of_valid_token=0.9,
min_ratio_of_valid_token=0.5)
data = {'input_ids': input_ids, 'attention_mask': attention_mask}
dataproto = DataProto.from_single_dict(data)
micro_batches, micro_bsz_idx_lst = rearrange_micro_batches(dataproto.batch, max_token_len=300)
batch = torch.cat(micro_batches)
micro_bsz_idx = []
for idx in micro_bsz_idx_lst:
micro_bsz_idx.extend(idx)
reverse_idx_map = get_reverse_idx(micro_bsz_idx)
reverse_idx_map = torch.tensor(reverse_idx_map)
new_batch = batch[reverse_idx_map]
torch.testing.assert_close(new_batch, dataproto.batch)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test the MultiTurnSFTDataset implementation
"""
import os
import pandas as pd
import torch
from transformers import AutoTokenizer
from verl.utils.dataset.multiturn_sft_dataset import MultiTurnSFTDataset
def test_multiturn_sft_dataset():
print("Starting test...")
# Create a temporary parquet file with test data
test_data = {
'messages': [[{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "What is 2+2?"
}, {
"role": "assistant",
"content": "2+2 equals 4."
}, {
"role": "user",
"content": "And what is 4+4?"
}, {
"role": "assistant",
"content": "4+4 equals 8."
}],
[{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "Tell me a joke."
}, {
"role": "assistant",
"content": "Why did the chicken cross the road?"
}, {
"role": "user",
"content": "Why?"
}, {
"role": "assistant",
"content": "To get to the other side!"
}]]
}
# Create test directory if it doesn't exist
os.makedirs('test_data', exist_ok=True)
test_file = 'test_data/test.parquet'
# Save test data to parquet
df = pd.DataFrame(test_data)
df.to_parquet(test_file)
# Initialize tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-Coder-7B-Instruct')
config = {'max_length': 512, 'truncation': 'error', 'multiturn': {'messages_key': 'messages'}}
dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, config=config)
# Test 1: Dataset Length
assert len(dataset) == 2, f"Expected dataset length 2, got {len(dataset)}"
# Get items for testing
item0 = dataset[0] # Math conversation
item1 = dataset[1] # Joke conversation
# Test 2: Required Keys and Types
required_keys = ['input_ids', 'attention_mask', 'position_ids', 'loss_mask']
for key in required_keys:
assert key in item0, f"Missing key {key} in dataset item"
assert isinstance(item0[key], torch.Tensor), f"Expected torch.Tensor for {key}"
assert item0[key].dtype == torch.long, f"Expected torch.long for {key}, got {item0[key].dtype}"
# Test 3: Shape Consistency
assert item0['loss_mask'].shape == item0['input_ids'].shape, \
"Loss mask shape doesn't match input_ids shape"
assert item0['attention_mask'].shape == item0['input_ids'].shape, \
"Attention mask shape doesn't match input_ids shape"
assert item0['position_ids'].shape == item0['input_ids'].shape, \
"Position IDs shape doesn't match input_ids shape"
# Test 4: Loss Mask Pattern - Math Conversation
loss_mask0 = item0['loss_mask']
input_ids0 = item0['input_ids']
# Find assistant response positions
assistant_positions0 = torch.where(loss_mask0 == 1)[0]
assert len(assistant_positions0) > 0, "No assistant positions found in loss mask"
# Decode and verify assistant responses
assistant_text0 = tokenizer.decode(input_ids0[loss_mask0 == 1])
print(f"Math conversation assistant text: {assistant_text0}")
assert "2+2 equals 4" in assistant_text0, "First assistant response not found"
assert "4+4 equals 8" in assistant_text0, "Second assistant response not found"
# Test 5: Loss Mask Pattern - Joke Conversation
loss_mask1 = item1['loss_mask']
input_ids1 = item1['input_ids']
# Find assistant response positions
assistant_positions1 = torch.where(loss_mask1 == 1)[0]
assert len(assistant_positions1) > 0, "No assistant positions found in loss mask"
# Decode and verify assistant responses
assistant_text1 = tokenizer.decode(input_ids1[loss_mask1 == 1])
print(f"Joke conversation assistant text: {assistant_text1}")
assert "chicken cross the road" in assistant_text1, "First assistant response not found"
assert "other side" in assistant_text1, "Second assistant response not found"
# Test 6: Attention Mask Pattern
attention_mask0 = item0['attention_mask']
sequence_length = torch.sum(attention_mask0)
assert sequence_length > 0, "No tokens marked as attended in attention mask"
assert torch.all(attention_mask0[:sequence_length] == 1), "Incorrect attention mask pattern"
if sequence_length < len(attention_mask0):
assert torch.all(attention_mask0[sequence_length:] == 0), "Padding not properly masked"
# Test 7: Position IDs Pattern
position_ids0 = item0['position_ids']
assert torch.equal(position_ids0[:sequence_length], torch.arange(sequence_length)), \
"Position IDs not sequential for non-padded tokens"
if sequence_length < len(position_ids0):
assert torch.all(position_ids0[sequence_length:] == 0), "Padding position IDs not zero"
# Test 8: Verify loss mask for assistant responses
# Get the full conversation text
full_text = tokenizer.decode(input_ids0)
print(f"\nFull conversation text:\n{full_text}")
# Get the assistant responses
assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 1])
print(f"\nAssistant responses (from loss mask):\n{assistant_text}")
# Verify that loss mask is set for all assistant responses
for msg in test_data['messages'][0]: # First conversation
if msg['role'] == 'assistant':
# The content should appear in the masked text
assert msg['content'] in assistant_text, \
f"Assistant message '{msg['content']}' not found in masked text"
# The content should NOT appear in the non-masked text
non_assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 0])
assert msg['content'] not in non_assistant_text, \
f"Assistant message '{msg['content']}' found in non-assistant text"
# Test 9: Verify non-assistant parts have loss_mask=0
# Get non-assistant text
non_assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 0])
print(f"\nNon-assistant text (from loss mask):\n{non_assistant_text}")
# Verify that system and user messages are in the non-assistant text
for msg in test_data['messages'][0]: # First conversation
if msg['role'] in ['system', 'user']:
assert msg['content'] in non_assistant_text, \
f"{msg['role'].title()} message '{msg['content']}' not found in non-assistant text"
# And verify they're NOT in the assistant text
assert msg['content'] not in assistant_text, \
f"{msg['role'].title()} message '{msg['content']}' found in assistant text"
# Test 10: Verify padding behavior
padding_config = {'max_length': 1024, 'truncation': 'error', 'multiturn': {'messages_key': 'messages'}}
small_dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, config=padding_config)
padded_item = small_dataset[0]
# Get actual sequence length (before padding)
actual_length = torch.sum(padded_item['attention_mask'])
# Verify padding tokens
assert torch.all(padded_item['input_ids'][actual_length:] == tokenizer.pad_token_id), \
"Padding tokens not set correctly"
assert torch.all(padded_item['attention_mask'][actual_length:] == 0), \
"Attention mask not set correctly for padding"
assert torch.all(padded_item['loss_mask'][actual_length:] == 0), \
"Loss mask not set correctly for padding"
print("All tests passed!")
print("Starting test...")
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from omegaconf import OmegaConf
def get_gsm8k_data():
# prepare test dataset
url = "https://github.com/eric-haibin-lin/verl-data/raw/refs/heads/main/gsm8k/train.parquet"
local_folder = os.path.expanduser('~/verl-data/gsm8k/')
local_path = os.path.join(local_folder, 'train.parquet')
os.makedirs(local_folder, exist_ok=True)
return local_path
def test_rl_dataset():
from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn
from verl.utils import hf_tokenizer
tokenizer = hf_tokenizer('deepseek-ai/deepseek-coder-1.3b-instruct')
local_path = get_gsm8k_data()
config = OmegaConf.create({
"prompt_key": "prompt",
"max_prompt_length": 256,
"filter_overlong_prompts": True,
"filter_overlong_prompts_workers": 2,
})
dataset = RLHFDataset(data_files=local_path, tokenizer=tokenizer, config=config)
dataloader = DataLoader(dataset=dataset, batch_size=16, shuffle=True, drop_last=True, collate_fn=collate_fn)
a = next(iter(dataloader))
from verl import DataProto
tensors = {}
non_tensors = {}
for key, val in a.items():
if isinstance(val, torch.Tensor):
tensors[key] = val
else:
non_tensors[key] = val
data_proto = DataProto.from_dict(tensors=tensors, non_tensors=non_tensors)
assert 'input_ids' in data_proto.batch
data = dataset[0]['input_ids']
output = tokenizer.batch_decode([data])[0]
print(f'type: {type(output)}')
print(f'\n\noutput: {output}')
def test_image_rl_data():
from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn
from verl.utils import hf_tokenizer, hf_processor
tokenizer = hf_tokenizer('Qwen/Qwen2-VL-2B-Instruct')
processor = hf_processor('Qwen/Qwen2-VL-2B-Instruct')
config = OmegaConf.create({
"prompt_key": "prompt",
"max_prompt_length": 1024,
"filter_overlong_prompts": True,
"filter_overlong_prompts_workers": 2,
})
dataset = RLHFDataset(data_files=os.path.expanduser("~/data/geo3k/train.parquet"),
tokenizer=tokenizer,
config=config,
processor=processor)
dataloader = DataLoader(dataset=dataset, batch_size=16, shuffle=True, drop_last=True, collate_fn=collate_fn)
a = next(iter(dataloader))
from verl import DataProto
tensors = {}
non_tensors = {}
for key, val in a.items():
if isinstance(val, torch.Tensor):
tensors[key] = val
else:
non_tensors[key] = val
data_proto = DataProto.from_dict(tensors=tensors, non_tensors=non_tensors)
assert 'multi_modal_data' in data_proto.non_tensor_batch
assert 'multi_modal_inputs' in data_proto.non_tensor_batch
data = dataset[0]['input_ids']
output = tokenizer.batch_decode([data])[0]
print(f'type: {type(output)}')
print(f'\n\noutput: {output}')
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from transformers import AutoTokenizer
from verl.utils import hf_tokenizer
from verl.utils.dataset.rm_dataset import RMDataset
def get_rm_data():
# prepare test dataset
url = "https://github.com/eric-haibin-lin/verl-data/raw/refs/heads/main/full_hh_rlhf/rm/test.parquet"
local_folder = os.path.expanduser('~/verl-data/full_hh_rlhf/rm/')
local_path = os.path.join(local_folder, 'test.parquet')
os.makedirs(local_folder, exist_ok=True)
return local_path
def test_rm_dataset():
tokenizer = hf_tokenizer("facebook/opt-1.3b")
local_path = get_rm_data()
dataset = RMDataset(parquet_files=local_path, tokenizer=tokenizer, max_length=512)
data = dataset[0]['input_ids']
output = tokenizer.batch_decode(data)
assert len(output) > 1
assert type(output[0]) == str
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from transformers import AutoTokenizer
from verl.utils import hf_tokenizer
from verl.utils.dataset.sft_dataset import SFTDataset
def get_gsm8k_data():
# prepare test dataset
url = "https://github.com/eric-haibin-lin/verl-data/raw/refs/heads/main/gsm8k/train.parquet"
local_folder = os.path.expanduser('~/verl-data/gsm8k/')
local_path = os.path.join(local_folder, 'train.parquet')
return local_path
def test_sft_cot_dataset():
tokenizer = hf_tokenizer('deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct')
local_path = get_gsm8k_data()
from omegaconf import OmegaConf
dataset = SFTDataset(parquet_files=local_path,
tokenizer=tokenizer,
config=OmegaConf.create({
'prompt_key': 'prompt',
'prompt_dict_keys': ['content'],
'response_key': 'extra_info',
'response_dict_keys': ['answer'],
'max_length': 512,
}))
data = dataset[0]['input_ids']
output = tokenizer.batch_decode([data])[0]
assert len(output) > 1
assert type(output) == str
def test_sft_dataset():
tokenizer = hf_tokenizer('deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct')
local_path = get_gsm8k_data()
from omegaconf import OmegaConf
dataset = SFTDataset(parquet_files=local_path,
tokenizer=tokenizer,
config=OmegaConf.create({
"prompt_key": 'extra_info',
'prompt_dict_keys': ['question'],
'response_key': 'extra_info',
'response_dict_keys': ['answer'],
'max_length': 512
}))
data = dataset[0]['input_ids']
output = tokenizer.batch_decode([data])[0]
assert len(output) > 1
assert type(output) == str
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import importlib.util
import pytest
from verl.utils.import_utils import load_extern_type
# Path to the test module
TEST_MODULE_PATH = os.path.join(os.path.dirname(__file__), "test_module.py")
def test_load_extern_type_class():
"""Test loading a class from an external file"""
TestClass = load_extern_type(TEST_MODULE_PATH, "TestClass")
# Verify the class was loaded correctly
assert TestClass is not None
assert TestClass.__name__ == "TestClass"
# Test instantiation and functionality
instance = TestClass()
assert instance.value == "default"
# Test with a custom value
custom_instance = TestClass("custom")
assert custom_instance.get_value() == "custom"
def test_load_extern_type_function():
"""Test loading a function from an external file"""
test_function = load_extern_type(TEST_MODULE_PATH, "test_function")
# Verify the function was loaded correctly
assert test_function is not None
assert callable(test_function)
# Test function execution
result = test_function()
assert result == "test_function_result"
def test_load_extern_type_constant():
"""Test loading a constant from an external file"""
constant = load_extern_type(TEST_MODULE_PATH, "TEST_CONSTANT")
# Verify the constant was loaded correctly
assert constant is not None
assert constant == "test_constant_value"
def test_load_extern_type_nonexistent_file():
"""Test behavior when file doesn't exist"""
with pytest.raises(FileNotFoundError):
load_extern_type("/nonexistent/path.py", "SomeType")
def test_load_extern_type_nonexistent_type():
"""Test behavior when type doesn't exist in the file"""
with pytest.raises(AttributeError):
load_extern_type(TEST_MODULE_PATH, "NonExistentType")
def test_load_extern_type_none_path():
"""Test behavior when file path is None"""
result = load_extern_type(None, "SomeType")
assert result is None
def test_load_extern_type_invalid_module():
"""Test behavior when module has syntax errors"""
# Create a temporary file with syntax errors
import tempfile
with tempfile.NamedTemporaryFile(suffix='.py', mode='w+', delete=False) as temp_file:
temp_file.write("This is not valid Python syntax :")
temp_path = temp_file.name
try:
with pytest.raises(RuntimeError):
load_extern_type(temp_path, "SomeType")
finally:
# Clean up the temporary file
if os.path.exists(temp_path):
os.remove(temp_path)
# Copyright 2025 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Test module for import_utils.load_extern_type testing
class TestClass:
"""A test class to be imported by load_extern_type"""
def __init__(self, value=None):
self.value = value or "default"
def get_value(self):
return self.value
TEST_CONSTANT = "test_constant_value"
def test_function():
return "test_function_result"
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
version_folder = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(version_folder, 'version/version')) as f:
__version__ = f.read().strip()
from .protocol import DataProto
from .utils.logging_utils import set_basic_config
import logging
set_basic_config(level=logging.WARNING)
from . import single_controller
__all__ = ['DataProto', "__version__"]
if os.getenv('VERL_USE_MODELSCOPE', 'False').lower() == 'true':
import importlib.util
if importlib.util.find_spec("modelscope") is None:
raise ImportError('You are using the modelscope hub, please install modelscope with `pip install modelscope -U`')
# Patch hub to download models from modelscope to speed up.
from modelscope.utils.hf_util import patch_hub
patch_hub()
# Models
Common model zoos such as huggingface/transformers struggle when used with PyTorch native model parallelism. Following the design principle of vLLM, we keep a simple, parallelizable, highly-optimized model implementation with packed inputs in verl.
## Adding a New Huggingface Model
### Step 1: Copy the model file from HF to verl
- Add a new file under verl/models/hf
- Copy ONLY the model file from huggingface/transformers/models to verl/models/hf
### Step 2: Modify the model file to use packed inputs
- Remove all the code related to inference (kv cache)
- Modify the inputs to include only
- input_ids (total_nnz,)
- cu_seqlens (total_nnz + 1,)
- max_seqlen_in_batch: int
- Note that this requires using flash attention with a causal mask (see the sketch below).
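
Below is a minimal sketch of the packed-input calling convention, not verl's actual model code: `packed_attention` and `build_cu_seqlens` are illustrative names, and the `flash-attn` package is assumed to be installed. Sequences are concatenated along a single `total_nnz` dimension and delimited by `cu_seqlens`, which is what flash attention's varlen kernel expects.

```python
import torch
from flash_attn import flash_attn_varlen_func  # assumption: flash-attn is installed


def build_cu_seqlens(attention_mask: torch.Tensor) -> torch.Tensor:
    # prefix sum of per-sequence valid lengths, shape (batch_size + 1,)
    seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)
    return torch.nn.functional.pad(seqlens.cumsum(0, dtype=torch.int32), (1, 0))


def packed_attention(q, k, v, cu_seqlens, max_seqlen_in_batch):
    # q, k, v: (total_nnz, num_heads, head_dim) -- no batch or padding dimension
    return flash_attn_varlen_func(
        q, k, v,
        cu_seqlens_q=cu_seqlens,
        cu_seqlens_k=cu_seqlens,
        max_seqlen_q=max_seqlen_in_batch,
        max_seqlen_k=max_seqlen_in_batch,
        causal=True,  # causal mask, as noted above
    )
```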
### Step 2.5: Add tests
- Add a test that compares this version against the huggingface version
- Follow the existing test infrastructure and add the tests under tests/models/hf
### Step 3: Add a function to apply tensor parallelism
- Please follow
- https://pytorch.org/docs/stable/distributed.tensor.parallel.html
- https://pytorch.org/tutorials/intermediate/TP_tutorial.html
- General comments
- Tensor parallelism in native PyTorch is NOT auto-parallelism. It works by specifying, via configs, how model parameters and inputs/outputs are resharded; these configs are registered as hooks that reshard inputs/outputs before/after the model forward (see the sketch below).
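
As an illustration of the comment above, here is a rough sketch (following the linked tutorials, not verl's actual code) of registering such a plan with PyTorch's native TP API. It assumes a HF-Llama-style module layout with `q_proj`/`k_proj`/`v_proj`/`o_proj` and `gate_proj`/`up_proj`/`down_proj` submodules; `apply_tensor_parallel` is an illustrative name.

```python
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, parallelize_module


def apply_tensor_parallel(model, tp_size: int):
    tp_mesh = init_device_mesh("cuda", (tp_size,))  # 1-D mesh over the TP ranks
    layer_plan = {
        # column-parallel: shard the output dimension of the projection
        "self_attn.q_proj": ColwiseParallel(),
        "self_attn.k_proj": ColwiseParallel(),
        "self_attn.v_proj": ColwiseParallel(),
        "mlp.gate_proj": ColwiseParallel(),
        "mlp.up_proj": ColwiseParallel(),
        # row-parallel: shard the input dimension and all-reduce the output
        "self_attn.o_proj": RowwiseParallel(),
        "mlp.down_proj": RowwiseParallel(),
    }
    # parallelize_module registers the plan as pre/post-forward hooks that
    # reshard inputs/outputs around each block's forward pass
    for block in model.model.layers:
        parallelize_module(block, tp_mesh, layer_plan)
    return model
```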
### Step 4: Add a function to apply data parallelism
- Please use FSDP2 APIs
- See the demo here: https://github.com/pytorch/torchtitan/blob/main/torchtitan/parallelisms/parallelize_llama.py#L413 (a sketch of the pattern follows below)
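
A minimal sketch of the FSDP2 pattern used in the torchtitan demo linked above. The import path varies by PyTorch version (nightlies around 2.4 expose `fully_shard` under `torch.distributed._composable.fsdp`, newer releases under `torch.distributed.fsdp`); `apply_fsdp` and `dp_mesh` are illustrative names, not verl APIs.

```python
import torch
from torch.distributed._composable.fsdp import MixedPrecisionPolicy, fully_shard


def apply_fsdp(model, dp_mesh):
    # compute in bf16, reduce gradients in fp32
    mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
    # shard each transformer block individually, then wrap the root module
    for block in model.model.layers:
        fully_shard(block, mesh=dp_mesh, mp_policy=mp_policy)
    fully_shard(model, mesh=dp_mesh, mp_policy=mp_policy)
    return model
```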
### Step 5: Add a function to apply pipeline parallelism
- Coming in PyTorch 2.4
- Currently only available in alpha in the nightly builds
- Check torchtitan for more details
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .modeling_llama_megatron import (
# original model with megatron
ParallelLlamaModel,
ParallelLlamaForCausalLM,
# rmpad with megatron
ParallelLlamaForCausalLMRmPad,
ParallelLlamaForValueRmPad,
# rmpad with megatron and pipeline parallelism
ParallelLlamaForCausalLMRmPadPP,
ParallelLlamaForValueRmPadPP)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
from packaging.version import Version
import torch
import time
from typing import Dict, Any, Callable, Optional
import torch.distributed as dist
def _megatron_calc_layer_map(config):
"""Calculate the mapping of global layer_idx to local layer_idx
Returns:
layer_map (Dict: int -> tuple(int, int, int)):
mapping from the global layer index to
a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
"""
from megatron.core import mpu
print(f'get megatron data parallel size: {mpu.get_data_parallel_world_size()}')
pp_size = mpu.get_pipeline_model_parallel_world_size()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
layer_map = dict()
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
for pp_rank_idx in range(pp_size):
for virtual_pp_rank_idx in range(virtual_pp_size):
layer_offset = (virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) +
pp_rank_idx * num_layers_per_model)
for layer_idx in range(num_layers_per_model):
layer_map[layer_offset + layer_idx] = (
pp_rank_idx,
virtual_pp_rank_idx,
layer_idx,
)
return layer_map
def load_state_dict_to_megatron_llama(state_dict,
wrapped_models,
config,
params_dtype,
is_value_model=False,
tie_word_embeddings=False):
"""Load merged state_dict to sharded Megatron module in training.
"""
from megatron.core import mpu
from verl.utils.megatron_utils import print_rank_0, unwrap_model
from megatron.core.transformer.module import Float16Module
from megatron.core import DistributedDataParallel as LocalDDP
from torch.nn.parallel import DistributedDataParallel as torchDDP
start_time = time.time()
def _get_gpt_model(model):
return model
def fetch_params(module):
# pull parameters from the data-parallel source rank; torch.distributed has no
# `fetch` collective, so a broadcast from the DP src rank is used instead
for param in module.parameters():
torch.distributed.broadcast(param.data,
src=mpu.get_data_parallel_src_rank(),
group=mpu.get_data_parallel_group())
dp_rank = mpu.get_data_parallel_rank()
pp_rank = mpu.get_pipeline_model_parallel_rank()
pp_size = mpu.get_pipeline_model_parallel_world_size()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
mp_group = mpu.get_model_parallel_group()
if torch.distributed.get_rank() == 0:
assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank()}] != 0 on rank #0"
assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
if not isinstance(wrapped_models, (list, tuple)):
wrapped_models = [wrapped_models]
assert len(wrapped_models) == virtual_pp_size
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers, f'num_layers_per_model: {num_layers_per_model} * pp_size: {pp_size} * virtual_pp_size {virtual_pp_size} != config.num_hidden_layers: {config.num_hidden_layers}'
models = [None] * len(wrapped_models)
for i, wrapped_model in enumerate(wrapped_models):
models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
gpt_model_module = _get_gpt_model(models[i])
assert len(gpt_model_module.model.layers) == num_layers_per_model
def _fetch_tensor(tensor, name) -> torch.Tensor:
"""fetch tensor"""
nonlocal state_dict
if tensor is not None:
tensor.data.copy_(state_dict[name])
def _fetch_tp_shard_tensor_vocab(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
"""fetch tensor in tp shards"""
nonlocal state_dict
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if name in state_dict:
full_weight = state_dict[name]
if mutate_func is not None:
full_weight = mutate_func(full_weight)
tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
if tensor is not None:
tensor.data.copy_(tensor_chunk[tp_rank])
else:
print(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
def _fetch_tp_shard_tensor(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
"""fetch tensor in tp shards"""
nonlocal state_dict
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if name in state_dict:
full_weight = state_dict[name]
if mutate_func is not None:
full_weight = mutate_func(full_weight)
tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
if tensor is not None:
tensor.data.copy_(tensor_chunk[tp_rank])
else:
print(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
def _fetch_tp_shard_tensor_gate_up(tensor, gate_name, up_name) -> torch.Tensor:
"""fetch gate_up tensor in tp shards"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if gate_name in state_dict and up_name in state_dict:
gate_weight = state_dict[gate_name]
up_weight = state_dict[up_name]
new_gate_up_weight = torch.empty(config.intermediate_size * 2,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
intermediate_size_tp = config.intermediate_size // tp_size
gate_weight_tp = gate_weight[i * intermediate_size_tp:(i + 1) * intermediate_size_tp]
up_weight_tp = up_weight[i * intermediate_size_tp:(i + 1) * intermediate_size_tp]
new_gate_up_weight[intermediate_size_tp * 2 * i:intermediate_size_tp * 2 * (i + 1)].copy_(
torch.cat([gate_weight_tp, up_weight_tp], dim=0))
tensor_chunk = torch.chunk(new_gate_up_weight, tp_size, dim=0)
if tensor is not None:
tensor.data.copy_(tensor_chunk[tp_rank])
else:
print(f"tp_shard tensor:[{gate_name}, {up_name}] not in state_dict, skip loading")
def _fetch_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name) -> torch.Tensor:
"""fetch tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
assert (q_name in state_dict and k_name in state_dict and v_name in state_dict)
full_weight_q = state_dict[q_name]
full_weight_k = state_dict[k_name]
full_weight_v = state_dict[v_name]
hidden_size_per_head = config.hidden_size // config.num_attention_heads
if config.num_key_value_heads >= tp_size:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
total_size = q_size_tp + 2 * kv_size_tp
new_weight_qkv = torch.empty(total_size * tp_size,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
q_part = full_weight_q[i * q_size_tp:(i + 1) * q_size_tp]
k_part = full_weight_k[i * kv_size_tp:(i + 1) * kv_size_tp]
v_part = full_weight_v[i * kv_size_tp:(i + 1) * kv_size_tp]
new_weight_qkv[i * total_size:(i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part], dim=0))
else:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head
total_size = q_size_tp + 2 * kv_size_tp
new_weight_qkv = torch.empty(total_size * tp_size,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
q_part = full_weight_q[i * q_size_tp:(i + 1) * q_size_tp]
start_idx = i * config.num_key_value_heads // tp_size * hidden_size_per_head
end_idx = (i * config.num_key_value_heads // tp_size + 1) * hidden_size_per_head
k_part = full_weight_k[start_idx:end_idx]
v_part = full_weight_v[start_idx:end_idx]
new_weight_qkv[i * total_size:(i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part], dim=0))
tensor_chunk = torch.chunk(new_weight_qkv, tp_size, dim=0)
if tensor is not None:
tensor.data.copy_(tensor_chunk[tp_rank])
# Embeddings
# -------------------
print_rank_0("loading embeddings...")
gpt_model_module = _get_gpt_model(models[0])
embed_tokens_weight = None
if pp_rank == 0:
embed_tokens_weight = gpt_model_module.model.embed_tokens.weight
_fetch_tp_shard_tensor_vocab(embed_tokens_weight, "model.embed_tokens.weight")
# Transformer layers
# -------------------
layer_map = _megatron_calc_layer_map(config)
pp_rank = mpu.get_pipeline_model_parallel_rank()
pp_size = mpu.get_pipeline_model_parallel_world_size()
num_layer_per_pp = config.num_hidden_layers // pp_size
vpp_size = mpu.get_virtual_pipeline_model_parallel_world_size()
layer_list = []
if vpp_size is not None:
for vpp_rank in range(vpp_size):
num_layer_vpp_chunk = num_layer_per_pp // vpp_size
num_layer_this_model = num_layer_vpp_chunk
offset = vpp_rank * (
config.num_hidden_layers // mpu.get_virtual_pipeline_model_parallel_world_size()) + \
(mpu.get_pipeline_model_parallel_rank() * num_layer_vpp_chunk)
layer_list.extend(list(range(offset, offset + num_layer_this_model)))
else:
num_layer_this_model = num_layer_per_pp
offset = pp_rank * num_layer_per_pp
layer_list.extend(list(range(offset, offset + num_layer_this_model)))
for layer in layer_list:
print_rank_0(f"loading layer #{layer}...")
layer_name = f"model.layers.{layer}"
dst_pp_rank, dst_virtual_pp_rank, dst_layer_idx = layer_map[layer]
gpt_model_module = _get_gpt_model(models[dst_virtual_pp_rank])
sync_layer = gpt_model_module.model.layers[dst_layer_idx]
_fetch_tensor(
sync_layer.input_layernorm.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.input_layernorm.weight",
)
_fetch_tp_shard_tensor_qkv(
sync_layer.self_attn.qkv_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.self_attn.q_proj.weight",
f"{layer_name}.self_attn.k_proj.weight",
f"{layer_name}.self_attn.v_proj.weight",
)
_fetch_tp_shard_tensor(
sync_layer.self_attn.o_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.self_attn.o_proj.weight",
chunk_dim=1,
)
_fetch_tensor(
sync_layer.post_attention_layernorm.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.post_attention_layernorm.weight",
)
_fetch_tp_shard_tensor_gate_up(sync_layer.mlp.gate_up_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.mlp.gate_proj.weight", f"{layer_name}.mlp.up_proj.weight")
_fetch_tp_shard_tensor(
sync_layer.mlp.down_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.mlp.down_proj.weight",
chunk_dim=1,
)
# Final Layernorm
# -------------------
print_rank_0("loading final layernorm...")
gpt_model_module = _get_gpt_model(models[-1])
_fetch_tensor(
getattr(gpt_model_module.model.norm, "weight", None),
"model.norm.weight",
)
print_rank_0("loading lm_head...")
if pp_rank + 1 == pp_size:
lm_head_weight = gpt_model_module.lm_head.weight
if is_value_model:
if 'lm_head.weight' in state_dict and state_dict['lm_head.weight'].shape[0] == 1:
_fetch_tensor(lm_head_weight, "lm_head.weight")
print_rank_0('load lm_head weight')
elif 'reward_head.weight' in state_dict and state_dict['reward_head.weight'].shape[0] == 1:
_fetch_tensor(lm_head_weight, "reward_head.weight")
print_rank_0('load lm_head from value_head weight')
else:
_fetch_tensor(None, "lm_head.weight")
print_rank_0('fail to match lm_head in value_model')
else:
_fetch_tp_shard_tensor(lm_head_weight, "lm_head.weight")
dist.barrier()
torch.cuda.empty_cache()
print_rank_0(f"loading megatron ckpt done, time elapsed {time.time() - start_time}s")
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
from packaging.version import Version
import torch
import time
from typing import Dict, Any, Callable, Optional
import torch.distributed as dist
def _megatron_calc_layer_map(config):
"""Calculate the mapping of global layer_idx to local layer_idx
Returns:
layer_map (Dict: int -> tuple(int, int, int)):
mapping from the global layer index to
a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
"""
from megatron.core import mpu
print(f'get megatron data parallel size: {mpu.get_data_parallel_world_size()}')
pp_size = mpu.get_pipeline_model_parallel_world_size()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
layer_map = dict()
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
for pp_rank_idx in range(pp_size):
for virtual_pp_rank_idx in range(virtual_pp_size):
layer_offset = (virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) +
pp_rank_idx * num_layers_per_model)
for layer_idx in range(num_layers_per_model):
layer_map[layer_offset + layer_idx] = (
pp_rank_idx,
virtual_pp_rank_idx,
layer_idx,
)
return layer_map
def load_state_dict_to_megatron_llama(state_dict,
wrapped_models,
config,
params_dtype,
is_value_model=False,
tie_word_embeddings=False):
"""Load merged state_dict to sharded Megatron module in training.
"""
from megatron.core import mpu
from verl.utils.megatron_utils import print_rank_0, unwrap_model
from megatron.core.transformer.module import Float16Module
from megatron.core import DistributedDataParallel as LocalDDP
from torch.nn.parallel import DistributedDataParallel as torchDDP
start_time = time.time()
def _get_gpt_model(model):
return model
def broadcast_params(module):
for param in module.parameters():
torch.distributed.broadcast(param.data,
src=mpu.get_data_parallel_src_rank(),
group=mpu.get_data_parallel_group())
dp_rank = mpu.get_data_parallel_rank()
pp_rank = mpu.get_pipeline_model_parallel_rank()
pp_size = mpu.get_pipeline_model_parallel_world_size()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
mp_group = mpu.get_model_parallel_group()
if torch.distributed.get_rank() == 0:
assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank()}] != 0 on rank #0"
assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
if not isinstance(wrapped_models, (list, tuple)):
wrapped_models = [wrapped_models]
assert len(wrapped_models) == virtual_pp_size
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers, f'num_layers_per_model: {num_layers_per_model} * pp_size: {pp_size} * virtual_pp_size {virtual_pp_size} != config.num_hidden_layers: {config.num_hidden_layers}'
models = [None] * len(wrapped_models)
for i, wrapped_model in enumerate(wrapped_models):
models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
gpt_model_module = _get_gpt_model(models[i])
assert len(gpt_model_module.model.layers) == num_layers_per_model
def _broadcast_tensor(tensor, name) -> torch.Tensor:
"""broadcast tensor from rank0 across mp_group"""
nonlocal state_dict
nonlocal mp_group
if torch.distributed.get_rank() == 0:
if name in state_dict:
weight = state_dict[name]
tensor_shape = weight.shape
else:
tensor_shape = None
else:
weight = None
tensor_shape = None
obj_list = [tensor_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
tensor_shape = obj_list[0]
if tensor_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tensor:[{name}] not in state_dict, skip load")
return
if tensor is None:
tensor = torch.empty(
tensor_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
if torch.distributed.get_rank() == 0:
tensor.data.copy_(weight)
dist.broadcast(tensor, src=0, group=mp_group)
def _broadcast_tp_shard_tensor_vocab(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if torch.distributed.get_rank() == 0:
if name in state_dict:
full_weight = state_dict[name]
if mutate_func is not None:
full_weight = mutate_func(full_weight)
tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
chunk_shape = tensor_chunk[0].shape
else:
chunk_shape = None
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
return
if tensor is None:
sync_tensor = torch.empty(
chunk_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
else:
assert (tensor.shape == chunk_shape
), f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
for i in range(tp_size):
if torch.distributed.get_rank() == 0:
sync_tensor.data.copy_(tensor_chunk[i])
dist.broadcast(sync_tensor, src=0, group=mp_group)
if (i == tp_rank) and (tensor is not None):
tensor.data.copy_(sync_tensor)
def _broadcast_tp_shard_tensor(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if torch.distributed.get_rank() == 0:
if name in state_dict:
full_weight = state_dict[name]
if mutate_func is not None:
full_weight = mutate_func(full_weight)
tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
chunk_shape = tensor_chunk[0].shape
else:
chunk_shape = None
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
return
if tensor is None:
sync_tensor = torch.empty(
chunk_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
else:
assert (tensor.shape == chunk_shape
), f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
for i in range(tp_size):
if torch.distributed.get_rank() == 0:
sync_tensor.data.copy_(tensor_chunk[i])
dist.broadcast(sync_tensor, src=0, group=mp_group)
if (i == tp_rank) and (tensor is not None):
tensor.data.copy_(sync_tensor)
def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if torch.distributed.get_rank() == 0:
gate_weight = state_dict[gate_name]
up_weight = state_dict[up_name]
new_gate_up_weight = torch.empty(config.intermediate_size * 2,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
intermediate_size_tp = config.intermediate_size // tp_size
gate_weight_tp = gate_weight[i * intermediate_size_tp:(i + 1) * intermediate_size_tp]
up_weight_tp = up_weight[i * intermediate_size_tp:(i + 1) * intermediate_size_tp]
new_gate_up_weight[intermediate_size_tp * 2 * i:intermediate_size_tp * 2 * (i + 1)].copy_(
torch.cat([gate_weight_tp, up_weight_tp], dim=0))
tensor_chunk = torch.chunk(new_gate_up_weight, tp_size, dim=0)
chunk_shape = tensor_chunk[0].shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# all or none ranks in the mp_group should reach here
print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] not in state_dict, skip loading")
return
if tensor is None:
sync_tensor = torch.empty(
chunk_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
else:
assert (
tensor.shape == chunk_shape
), f"rank #{torch.distributed.get_rank()} tensor {gate_name, up_name} shape {tensor.shape} != {chunk_shape}"
sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
for i in range(tp_size):
if torch.distributed.get_rank() == 0:
sync_tensor.data.copy_(tensor_chunk[i])
dist.broadcast(sync_tensor, src=0, group=mp_group)
if (i == tp_rank) and (tensor is not None):
tensor.data.copy_(sync_tensor)
def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
if torch.distributed.get_rank() == 0:
assert (q_name in state_dict and k_name in state_dict and v_name in state_dict)
full_weight_q = state_dict[q_name]
full_weight_k = state_dict[k_name]
full_weight_v = state_dict[v_name]
hidden_size_per_head = config.hidden_size // config.num_attention_heads
if config.num_key_value_heads >= tp_size:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
total_size = q_size_tp + 2 * kv_size_tp
new_weight_qkv = torch.empty(total_size * tp_size,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
q_part = full_weight_q[i * q_size_tp:(i + 1) * q_size_tp]
k_part = full_weight_k[i * kv_size_tp:(i + 1) * kv_size_tp]
v_part = full_weight_v[i * kv_size_tp:(i + 1) * kv_size_tp]
new_weight_qkv[i * total_size:(i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part],
dim=0))
else:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head
total_size = q_size_tp + 2 * kv_size_tp
new_weight_qkv = torch.empty(total_size * tp_size,
config.hidden_size,
dtype=params_dtype,
device=torch.cuda.current_device())
for i in range(tp_size):
q_part = full_weight_q[i * q_size_tp:(i + 1) * q_size_tp]
start_idx = i * config.num_key_value_heads // tp_size * hidden_size_per_head
end_idx = (i * config.num_key_value_heads // tp_size + 1) * hidden_size_per_head
k_part = full_weight_k[start_idx:end_idx]
v_part = full_weight_v[start_idx:end_idx]
new_weight_qkv[i * total_size:(i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part],
dim=0))
tensor_chunk = torch.chunk(new_weight_qkv, tp_size, dim=0)
chunk_shape = tensor_chunk[0].shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=0, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# either every rank in the mp_group reaches here, or none of them do
print_rank_0(f"tp_shard tensor:[{q_name, k_name, v_name}] not in state_dict, skip loading")
return
if tensor is None:
sync_tensor = torch.empty(
chunk_shape,
dtype=params_dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
else:
assert (tensor.shape == chunk_shape
), f"rank #{torch.distributed.get_rank()} tensor {q_name} shape {tensor.shape} != {chunk_shape}"
sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
for i in range(tp_size):
if torch.distributed.get_rank() == 0:
sync_tensor.data.copy_(tensor_chunk[i])
dist.broadcast(sync_tensor, src=0, group=mp_group)
if (i == tp_rank) and (tensor is not None):
tensor.data.copy_(sync_tensor)
if dp_rank == 0:
# Embeddings
# -------------------
print_rank_0("loading embeddings...")
gpt_model_module = _get_gpt_model(models[0])
embed_tokens_weight = None
if pp_rank == 0:
embed_tokens_weight = gpt_model_module.model.embed_tokens.weight
_broadcast_tp_shard_tensor_vocab(embed_tokens_weight, "model.embed_tokens.weight")
# Transformer layers
# -------------------
layer_map = _megatron_calc_layer_map(config)
for layer in range(config.num_hidden_layers):
print_rank_0(f"loading layer #{layer}...")
layer_name = f"model.layers.{layer}"
dst_pp_rank, dst_virtual_pp_rank, dst_layer_idx = layer_map[layer]
gpt_model_module = _get_gpt_model(models[dst_virtual_pp_rank])
sync_layer = gpt_model_module.model.layers[dst_layer_idx]
_broadcast_tensor(
sync_layer.input_layernorm.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.input_layernorm.weight",
)
_broadcast_tp_shard_tensor_qkv(
sync_layer.self_attn.qkv_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.self_attn.q_proj.weight",
f"{layer_name}.self_attn.k_proj.weight",
f"{layer_name}.self_attn.v_proj.weight",
)
_broadcast_tp_shard_tensor(
sync_layer.self_attn.o_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.self_attn.o_proj.weight",
chunk_dim=1,
)
_broadcast_tensor(
sync_layer.post_attention_layernorm.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.post_attention_layernorm.weight",
)
_broadcast_tp_shard_tensor_gate_up(sync_layer.mlp.gate_up_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.mlp.gate_proj.weight", f"{layer_name}.mlp.up_proj.weight")
_broadcast_tp_shard_tensor(
sync_layer.mlp.down_proj.weight if dst_pp_rank == pp_rank else None,
f"{layer_name}.mlp.down_proj.weight",
chunk_dim=1,
)
# Final Layernorm
# -------------------
print_rank_0("loading final layernorm...")
gpt_model_module = _get_gpt_model(models[-1])
_broadcast_tensor(
getattr(gpt_model_module.model.norm, "weight", None),
"model.norm.weight",
)
print_rank_0("loading lm_head...")
lm_head_weight = None
if pp_rank + 1 == pp_size:
lm_head_weight = gpt_model_module.lm_head.weight
if is_value_model:
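# a value/reward model has a single-output head whose weight may be stored in the HF
# checkpoint as either lm_head.weight or reward_head.weight with shape [1, hidden_size]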
if 'lm_head.weight' in state_dict and state_dict['lm_head.weight'].shape[0] == 1:
_broadcast_tensor(lm_head_weight, "lm_head.weight")
print_rank_0('load lm_head weight')
elif 'reward_head.weight' in state_dict and state_dict['reward_head.weight'].shape[0] == 1:
_broadcast_tensor(lm_head_weight, "reward_head.weight")
print_rank_0('load lm_head from value_head weight')
else:
_broadcast_tensor(None, "lm_head.weight")
print_rank_0('failed to match lm_head or reward_head in value_model')
else:
_broadcast_tp_shard_tensor(lm_head_weight, "lm_head.weight")
dist.barrier()
# Broadcast weights inside data parallel groups
for wrapped_model in wrapped_models:
broadcast_params(wrapped_model)
torch.cuda.empty_cache()
print_rank_0(f"loading megatron ckpt done, time elapsed {time.time() - start_time}s")
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import torch
import torch.distributed as dist
from megatron.core import mpu
from megatron.core.distributed import DistributedDataParallel as LocalDDP
from megatron.core.transformer.module import Float16Module
from torch.nn.parallel import DistributedDataParallel as torchDDP
from verl.utils.megatron_utils import print_rank_0, unwrap_model
def _megatron_calc_global_rank(tp_rank: int = 0, dp_rank: int = 0, pp_rank: int = 0):
"""given TP,DP,PP rank to get the global rank."""
tp_size = mpu.get_tensor_model_parallel_world_size()
dp_size = mpu.get_data_parallel_world_size()
pp_size = mpu.get_pipeline_model_parallel_world_size()
assert (tp_size * dp_size * pp_size == torch.distributed.get_world_size()
), f"{tp_size} x {dp_size} x {pp_size} != {torch.distributed.get_world_size()}"
# We only support TP-DP-PP grouping, for correctness when resharding
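# e.g. tp_size=2, dp_size=2: (tp_rank=1, dp_rank=0, pp_rank=1) -> (1 * 2 + 0) * 2 + 1 = 5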
return (pp_rank * dp_size + dp_rank) * tp_size + tp_rank
def _megatron_calc_layer_map(config):
"""Calculate the mapping of global layer_idx to local layer_idx
Returns:
layer_map (Dict: int -> tuple(int, int, int)):
mapping from the global layer index to
a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
"""
from megatron.core import mpu
pp_size = mpu.get_pipeline_model_parallel_world_size()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
layer_map = dict()
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
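# e.g. num_hidden_layers=8, pp_size=2, virtual_pp_size=2 -> num_layers_per_model=2;
# global layers 6 and 7 map to (pp_rank=1, virtual_pp_rank=1) as local layers 0 and 1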
for pp_rank_idx in range(pp_size):
for virtual_pp_rank_idx in range(virtual_pp_size):
layer_offset = (virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) +
pp_rank_idx * num_layers_per_model)
for layer_idx in range(num_layers_per_model):
layer_map[layer_offset + layer_idx] = (
pp_rank_idx,
virtual_pp_rank_idx,
layer_idx,
)
return layer_map
def merge_megatron_ckpt_llama(wrapped_models, config, dtype, is_value_model=False, tie_word_embeddings=False):
"""Merge sharded parameters of a Megatron module into a merged checkpoint.
Args:
wrapped_models (list of megatron.core.distributed.DistributedDataParallel):
The local DDP wrapped megatron modules.
config (LlamaConfig):
HF model config
dtype: dtype that the merged parameters are cast to
is_value_model: whether the model is a value/reward model
tie_word_embeddings: unused for llama; kept only to match the qwen2 interface
Returns:
state_dict (dict):
The merged state_dict in rank 0, and an empty dictionary in other ranks.
"""
start_time = time.time()
def _get_gpt_model(model):
return model
dp_rank = mpu.get_data_parallel_rank()
pp_size = mpu.get_pipeline_model_parallel_world_size()
pp_rank = mpu.get_pipeline_model_parallel_rank()
virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
mp_group = mpu.get_model_parallel_group()
if dist.get_rank() == 0:
assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank()}] != 0 on rank #0"
assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
if not isinstance(wrapped_models, (list, tuple)):
wrapped_models = [wrapped_models]
assert len(wrapped_models) == virtual_pp_size
num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
models = [None] * len(wrapped_models)
for i, wrapped_model in enumerate(wrapped_models):
models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
assert len(models[i].model.layers
) == num_layers_per_model, 'len model layers {} not equal to num_layers_per_model {}'.format(
len(models[i].model.layers), num_layers_per_model)
state_dict = dict()
def _get_cpu_tensor(tensor: torch.Tensor):
if tensor is None:
return None
if tensor.device == torch.device("cpu"):
return tensor.detach().clone()
return tensor.detach().cpu()
def _broadcast_tensor(tensor, name, src_pp_rank) -> torch.Tensor:
"""broadcast tensor across mp_group"""
nonlocal state_dict
nonlocal mp_group
src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
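# the owning rank (tp_rank 0, dp_rank 0 on src_pp_rank) broadcasts the tensor; rank 0 keeps a CPU copy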
if torch.distributed.get_rank() == src_rank:
if tensor is None:
weight = None
tensor_shape = None
else:
weight = tensor
tensor_shape = weight.shape
else:
weight = None
tensor_shape = None
obj_list = [tensor_shape]
dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
tensor_shape = obj_list[0]
if tensor_shape is None:
# either every rank in the mp_group reaches here, or none of them do
print_rank_0(f"tensor:[{name}] does not exist, skip collecting")
return
if weight is None:
weight = torch.empty(
tensor_shape,
dtype=dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
dist.broadcast(weight, src=src_rank, group=mp_group)
if torch.distributed.get_rank() == 0:
state_dict[name] = _get_cpu_tensor(weight)
def _broadcast_tp_shard_tensor(tensor, name, src_pp_rank, concat_dim=0, mutate_func=None) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
if torch.distributed.get_rank() == src_rank:
chunk_shape = tensor.shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# either every rank in the mp_group reaches here, or none of them do
print_rank_0(f"tp_shard tensor:[{name}] does not exist, skip collecting")
return
buffer_tensor = torch.empty(
chunk_shape,
dtype=dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
chunk_tensors = [None] * tp_size
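# each TP rank broadcasts its local shard in turn; rank 0 collects all shards on CPU
# and concatenates them along concat_dim below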
for i in range(tp_size):
cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
if torch.distributed.get_rank() == 0:
chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
if torch.distributed.get_rank() == 0:
full_tensor = torch.concat(chunk_tensors, dim=concat_dim)
if mutate_func is not None:
full_tensor = mutate_func(full_tensor)
state_dict[name] = full_tensor
def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name, src_pp_rank) -> torch.Tensor:
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
if torch.distributed.get_rank() == src_rank:
chunk_shape = tensor.shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# either every rank in the mp_group reaches here, or none of them do
print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] does not exist, skip collecting")
return
buffer_tensor = torch.empty(
chunk_shape,
dtype=dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
chunk_tensors = [None] * tp_size
for i in range(tp_size):
cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
if torch.distributed.get_rank() == 0:
chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
if torch.distributed.get_rank() == 0:
full_tensor = torch.concat(chunk_tensors, dim=0)
intermediate_size_tp = config.intermediate_size // tp_size
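# undo the per-shard [gate_i; up_i] interleaving: split every block of
# 2 * intermediate_size_tp rows back into its gate and up halves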
gate_weight_list = []
up_weight_list = []
for i in range(tp_size):
gate_up_weight_tp = full_tensor[intermediate_size_tp * 2 * i:intermediate_size_tp * 2 * (i + 1)]
gate_weight_tp = gate_up_weight_tp[:intermediate_size_tp]
up_weight_tp = gate_up_weight_tp[intermediate_size_tp:]
gate_weight_list.append(gate_weight_tp)
up_weight_list.append(up_weight_tp)
state_dict[gate_name] = torch.cat(gate_weight_list, dim=0)
state_dict[up_name] = torch.cat(up_weight_list, dim=0)
def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, src_pp_rank):
"""broadcast tensor in tp shards across mp_group"""
nonlocal state_dict
nonlocal mp_group
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
if torch.distributed.get_rank() == src_rank:
chunk_shape = tensor.shape
else:
chunk_shape = None
obj_list = [chunk_shape]
dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
chunk_shape = obj_list[0]
if chunk_shape is None:
# either every rank in the mp_group reaches here, or none of them do
print_rank_0(f"tp_shard tensor:[{q_name, k_name, v_name}] does not exist, skip collecting")
return
buffer_tensor = torch.empty(
chunk_shape,
dtype=dtype,
device=torch.cuda.current_device(),
requires_grad=False,
)
chunk_tensors = [None] * tp_size
for i in range(tp_size):
cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
if torch.distributed.get_rank() == 0:
chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
if torch.distributed.get_rank() == 0:
full_tensor = torch.concat(chunk_tensors, dim=0)
q_weight_list = []
k_weight_list = []
v_weight_list = []
hidden_size_per_head = config.hidden_size // config.num_attention_heads
if config.num_key_value_heads >= tp_size:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
total_size = q_size_tp + 2 * kv_size_tp
for i in range(tp_size):
qkv_part = full_tensor[i * total_size:(i + 1) * total_size]
q_part = qkv_part[:q_size_tp]
k_part = qkv_part[q_size_tp:q_size_tp + kv_size_tp]
v_part = qkv_part[q_size_tp + kv_size_tp:total_size]
q_weight_list.append(q_part)
k_weight_list.append(k_part)
v_weight_list.append(v_part)
else:
q_size_tp = config.hidden_size // tp_size
kv_size_tp = hidden_size_per_head
total_size = q_size_tp + 2 * kv_size_tp
for i in range(tp_size):
qkv_part = full_tensor[i * total_size:(i + 1) * total_size]
q_part = qkv_part[:q_size_tp]
k_part = qkv_part[q_size_tp:q_size_tp + kv_size_tp]
v_part = qkv_part[q_size_tp + kv_size_tp:total_size]
q_weight_list.append(q_part)
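# when num_key_value_heads < tp_size, each KV head was replicated across
# tp_size // num_key_value_heads consecutive TP ranks; keep only the first copy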
if i * config.num_key_value_heads % tp_size == 0:
k_weight_list.append(k_part)
v_weight_list.append(v_part)
state_dict[q_name] = torch.cat(q_weight_list, dim=0)
state_dict[k_name] = torch.cat(k_weight_list, dim=0)
state_dict[v_name] = torch.cat(v_weight_list, dim=0)
# empty cache before collecting weights
torch.cuda.empty_cache()
if dp_rank == 0:
# Embeddings
# -------------------
print_rank_0("collecting embeddings...")
gpt_model_module = _get_gpt_model(models[0])
_broadcast_tp_shard_tensor(
gpt_model_module.model.embed_tokens.weight if pp_rank == 0 else None,
"model.embed_tokens.weight",
src_pp_rank=0,
)
# Transformer layers
# -------------------
layer_map = _megatron_calc_layer_map(config)
for layer in range(config.num_hidden_layers):
print_rank_0(f"collecting layer #{layer}...")
layer_name = f"model.layers.{layer}"
src_pp_rank, src_virtual_pp_rank, src_layer_idx = layer_map[layer]
gpt_model_module = _get_gpt_model(models[src_virtual_pp_rank])
sync_layer = gpt_model_module.model.layers[src_layer_idx]
_broadcast_tensor(
sync_layer.input_layernorm.weight,
f"{layer_name}.input_layernorm.weight",
src_pp_rank=src_pp_rank,
)
_broadcast_tp_shard_tensor_qkv(
sync_layer.self_attn.qkv_proj.weight,
f"{layer_name}.self_attn.q_proj.weight",
f"{layer_name}.self_attn.k_proj.weight",
f"{layer_name}.self_attn.v_proj.weight",
src_pp_rank=src_pp_rank,
)
_broadcast_tp_shard_tensor(
sync_layer.self_attn.o_proj.weight,
f"{layer_name}.self_attn.o_proj.weight",
concat_dim=1,
src_pp_rank=src_pp_rank,
)
_broadcast_tensor(
sync_layer.post_attention_layernorm.weight,
f"{layer_name}.post_attention_layernorm.weight",
src_pp_rank=src_pp_rank,
)
_broadcast_tp_shard_tensor_gate_up(sync_layer.mlp.gate_up_proj.weight,
f"{layer_name}.mlp.gate_proj.weight",
f"{layer_name}.mlp.up_proj.weight",
src_pp_rank=src_pp_rank)
_broadcast_tp_shard_tensor(
sync_layer.mlp.down_proj.weight,
f"{layer_name}.mlp.down_proj.weight",
concat_dim=1,
src_pp_rank=src_pp_rank,
)
# Final Layernorm
# -------------------
print_rank_0("collecting final layernorm...")
gpt_model_module = _get_gpt_model(models[-1])
_broadcast_tensor(
getattr(gpt_model_module.model.norm, "weight", None),
"model.norm.weight",
src_pp_rank=pp_size - 1,
)
print_rank_0("collecting lm_head...")
if is_value_model:
if pp_rank == pp_size - 1:
print(f'gpt_model_module.lm_head.weight: {gpt_model_module.lm_head.weight.shape}')
_broadcast_tensor(gpt_model_module.lm_head.weight if pp_rank == pp_size - 1 else None,
"lm_head.weight",
src_pp_rank=pp_size - 1)
_broadcast_tensor(gpt_model_module.reward_head.weight if pp_rank == pp_size - 1 and
getattr(gpt_model_module, "reward_weight", None) is not None else None,
"reward_head.weight",
src_pp_rank=pp_size - 1)
else:
_broadcast_tp_shard_tensor(
getattr(gpt_model_module.lm_head, "weight", None) if pp_rank == pp_size - 1 else None,
"lm_head.weight",
src_pp_rank=pp_size - 1,
)
dist.barrier()
torch.cuda.empty_cache()
if torch.distributed.get_rank() == 0:
if dtype not in [torch.float16, torch.bfloat16, torch.float32]:
print(f'Unknown/unsupported dtype to save: {dtype}')
exit(1)
for k, v in state_dict.items():
if dtype != v.dtype:
state_dict[k] = v.to(dtype)
print_rank_0(f"merge megatron ckpt done, time elapsed {time.time() - start_time}s")
return state_dict
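# Usage sketch (names other than the function's own parameters are illustrative):
# state_dict = merge_megatron_ckpt_llama(wrapped_models, hf_config, dtype=torch.bfloat16)
# Only rank 0 receives the populated HF-style state_dict; all other ranks get an empty dict.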
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .parallel_attention import ParallelLlamaAttention
from .parallel_decoder import ParallelLlamaDecoderLayer, ParallelLlamaDecoderLayerRmPad
from .parallel_mlp import ParallelLlamaMLP
from .parallel_rmsnorm import ParallelLlamaRMSNorm
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Optional, Tuple
import torch
from megatron.core import parallel_state as mpu
from megatron.core import tensor_parallel
from megatron.core import ModelParallelConfig
from torch import nn
from transformers import LlamaConfig
from verl.models.llama.megatron.layers.parallel_linear import QKVParallelLinear
from verl.utils.megatron import tensor_parallel as tp_utils
class LlamaRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base**(torch.arange(0, self.dim, 2).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(seq_len=max_position_embeddings,
device=self.inv_freq.device,
dtype=torch.get_default_dtype())
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def forward(self, x, seq_len=None):
# x: [bs, num_attention_heads, seq_len, head_size]
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
"""LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
t = t / self.scaling_factor
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
"""LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
if seq_len > self.max_position_embeddings:
base = self.base * ((self.scaling_factor * seq_len / self.max_position_embeddings) -
(self.scaling_factor - 1))**(self.dim / (self.dim - 2))
inv_freq = 1.0 / (base**(torch.arange(0, self.dim, 2).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
class LlamaLlama3ScalingRotaryEmbedding(LlamaRotaryEmbedding):
def __init__(self, dim, config, max_position_embeddings=2048, base=10000, device=None):
super().__init__(dim, max_position_embeddings, base, device)
self.factor = config.rope_scaling["factor"] # `8` in the original implementation
self.high_freq_factor = config.rope_scaling["high_freq_factor"] # `1` in the original implementation
self.low_freq_factor = config.rope_scaling["low_freq_factor"] # `4` in the original implementation
self.old_context_len = config.rope_scaling[
"original_max_position_embeddings"] # `8192` in the original implementation
low_freq_wavelen = self.old_context_len / self.low_freq_factor
high_freq_wavelen = self.old_context_len / self.high_freq_factor
wavelen = 2 * math.pi / self.inv_freq
# wavelen < high_freq_wavelen: do nothing; wavelen > low_freq_wavelen: divide by factor
inv_freq_llama = torch.where(wavelen > low_freq_wavelen, self.inv_freq / self.factor, self.inv_freq)
# otherwise: interpolate between the two, using a smooth factor
smooth_factor = (self.old_context_len / wavelen - self.low_freq_factor) / (self.high_freq_factor -
self.low_freq_factor)
smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / self.factor + smooth_factor * inv_freq_llama
is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
inv_freq = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
self.register_buffer("inv_freq", inv_freq, persistent=False)
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(seq_len=max_position_embeddings,
device=self.inv_freq.device,
dtype=torch.get_default_dtype())
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., :x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
"""
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
class ParallelLlamaAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
super().__init__()
self.config = config
self.megatron_config = megatron_config
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
# assign values after tp
tp_size = mpu.get_tensor_model_parallel_world_size()
assert self.num_heads % tp_size == 0, f'num_head must be divisible by tp_size. Got num_head={self.num_heads}, tp_size={tp_size}'
assert self.num_key_value_heads % tp_size == 0, \
f'num_key_value_heads must be divisible by tp_size. Got num_key_value_heads={self.num_key_value_heads}, tp_size={tp_size}'
self.num_heads_per_tp = self.num_heads // tp_size
self.num_key_value_heads_per_tp = self.num_key_value_heads // tp_size
self.hidden_size_per_tp = self.hidden_size // tp_size
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads}).")
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
row_kwargs = tp_utils.get_default_kwargs_for_row_parallel_linear()
if megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
assert row_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(column_kwargs, megatron_config)
tp_utils.update_kwargs_with_config(row_kwargs, megatron_config)
# [self.q_size, self.k_size, self.v_size]
self.qkv_proj = QKVParallelLinear(input_size=self.hidden_size,
num_heads=self.num_heads,
num_key_value_heads=self.num_key_value_heads,
head_dim=self.head_dim,
bias=config.attention_bias,
gather_output=False,
skip_bias_add=False,
**column_kwargs)
self.q_size = self.num_heads_per_tp * self.head_dim
self.k_size = self.num_key_value_heads_per_tp * self.head_dim
self.v_size = self.num_key_value_heads_per_tp * self.head_dim
self.o_proj = tensor_parallel.RowParallelLinear(input_size=self.num_heads * self.head_dim,
output_size=self.hidden_size,
bias=config.attention_bias,
input_is_parallel=True,
skip_bias_add=False,
**row_kwargs)
self._init_rope()
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = LlamaRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
else:
rope_type_key = "type" if "type" in self.config.rope_scaling else "rope_type"
scaling_type = self.config.rope_scaling[rope_type_key]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
elif scaling_type == "dynamic":
self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
elif scaling_type == "llama3":
self.rotary_emb = LlamaLlama3ScalingRotaryEmbedding(
self.head_dim,
self.config,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
qkv = self.qkv_proj(hidden_states)[0]
query_states, key_states, value_states = qkv.split([self.q_size, self.k_size, self.v_size], dim=-1)
query_states = query_states.view(bsz, q_len, self.num_heads_per_tp, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads_per_tp, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads_per_tp, self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
if attn_weights.size() != (bsz, self.num_heads_per_tp, q_len, kv_seq_len):
raise ValueError(
f"Attention weights should be of size {(bsz, self.num_heads_per_tp, q_len, kv_seq_len)}, but is"
f" {attn_weights.size()}")
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}")
attn_weights = attn_weights + attention_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads_per_tp, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads_per_tp, q_len, self.head_dim)}, but is"
f" {attn_output.size()}")
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size_per_tp)
attn_output = self.o_proj(attn_output)[0]
return attn_output
"""
Remove padding Attention
- Using Flash-attn 2
- Compatible with sequence parallel
"""
from transformers.utils import is_flash_attn_2_available
import torch.nn.functional as F
from einops import rearrange
if is_flash_attn_2_available():
from flash_attn import flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
def apply_rotary_pos_emb_rmpad(q, k, cos, sin, position_ids, indices, sequence_length):
batch_size = position_ids.shape[0]
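# temporarily pad back to (batch, seqlen, ...) so cos/sin can be indexed with position_ids,
# then re-remove the padding with index_first_axis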
q = pad_input(q, indices, batch_size, sequence_length) # (batch_size, seqlen, num_head, head_dim)
k = pad_input(k, indices, batch_size, sequence_length)
cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim]
sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim]
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
q_embed = index_first_axis(rearrange(q_embed, "b s ... -> (b s) ..."), indices)
k_embed = index_first_axis(rearrange(k_embed, "b s ... -> (b s) ..."), indices)
return q_embed, k_embed
from flash_attn.layers.rotary import apply_rotary_emb
# use flash-attn rotary embeddings with rmpad
# cos/sin should be: (seq_length, rotary_dim / 2)
def apply_rotary_pos_emb_rmpad_flash(q, k, cos, sin, cu_seqlens, max_seqlen):
q_embed = apply_rotary_emb(q,
cos,
sin,
interleaved=False,
inplace=False,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen)
k_embed = apply_rotary_emb(k,
cos,
sin,
interleaved=False,
inplace=False,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen)
return q_embed, k_embed
class ParallelLlamaAttentionRmPad(ParallelLlamaAttention):
def forward(self,
hidden_states: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
sequence_length: int = None,
indices: torch.Tensor = None,
cu_seqlens: torch.Tensor = None,
max_seqlen_in_batch: int = None):
total_nnz, _, _ = hidden_states.size() # This is the total_nnz padded after sequence parallel
if self.megatron_config.sequence_parallel:
total_nnz = total_nnz * mpu.get_tensor_model_parallel_world_size()
qkv = self.qkv_proj(hidden_states)[0]
query_states, key_states, value_states = qkv.split([self.q_size, self.k_size, self.v_size],
dim=-1) # (total_nnz, 1, hidden_size)
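# with sequence parallelism the packed sequence was padded so it can be split across TP ranks;
# cu_seqlens[-1] is the real token count, so the padding is stripped here and re-added after attention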
if self.megatron_config.sequence_parallel:
sequence_parallel_pad = total_nnz - cu_seqlens[-1]
total_nnz = cu_seqlens[-1] # total_nnz before sp padding
query_states = query_states[:total_nnz]
key_states = key_states[:total_nnz]
value_states = value_states[:total_nnz]
# flash_attn_varlen_func expects unpadded inputs of shape
# (total_nnz, num_heads, head_dim),
# so reshape without transposing the head dimension
query_states = query_states.view(total_nnz, self.num_heads_per_tp, self.head_dim)
key_states = key_states.view(total_nnz, self.num_key_value_heads_per_tp, self.head_dim)
value_states = value_states.view(total_nnz, self.num_key_value_heads_per_tp, self.head_dim)
cos, sin = self.rotary_emb(value_states, seq_len=sequence_length)
cos, sin = cos[:, :cos.shape[1] // 2], sin[:, :sin.shape[1] // 2] # flash attn only needs half
query_states, key_states = apply_rotary_pos_emb_rmpad_flash(query_states,
key_states,
cos,
sin,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen_in_batch)
# query_states, key_states = apply_rotary_pos_emb_rmpad(query_states, key_states, cos, sin, position_ids, indices,
# TODO: llama does not have dropout in the config??
# It is recommended to use dropout with FA according to the docs
# when training.
dropout_rate = 0.0 # if not self.training else self.attn_dropout
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
# therefore the input hidden states gets silently casted in float32. Hence, we need
# cast them back in float16 just to be sure everything works as expected.
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
# in fp32. (LlamaRMSNorm handles it correctly)
input_dtype = query_states.dtype
if input_dtype == torch.float32:
query_states = query_states.to(torch.float16)
key_states = key_states.to(torch.float16)
value_states = value_states.to(torch.float16)
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens,
cu_seqlens_k=cu_seqlens,
max_seqlen_q=max_seqlen_in_batch,
max_seqlen_k=max_seqlen_in_batch,
dropout_p=dropout_rate,
softmax_scale=None,
causal=True,
)
attn_output_unpad = attn_output_unpad.to(input_dtype)
attn_output_unpad = attn_output_unpad.reshape(total_nnz, 1, self.hidden_size_per_tp).contiguous()
# sequence parallel reduce_scatter is performed inside RowColumnParallel if enabled
# Here we need to repad
if self.megatron_config.sequence_parallel:
attn_output_unpad = F.pad(attn_output_unpad, pad=(0, 0, 0, 0, 0, sequence_parallel_pad))
attn_output_unpad = self.o_proj(attn_output_unpad)[0]
return attn_output_unpad
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Tuple
import torch
from torch import nn
from transformers import LlamaConfig
from megatron.core import ModelParallelConfig
from .parallel_attention import ParallelLlamaAttention, ParallelLlamaAttentionRmPad
from .parallel_mlp import ParallelLlamaMLP
from .parallel_rmsnorm import ParallelLlamaRMSNorm
from verl.utils.megatron_utils import TransformerConfig, convert_config
class ParallelLlamaDecoderLayer(nn.Module):
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig, layer_idx: int):
super().__init__()
self.config: TransformerConfig = convert_config(config, megatron_config)
self.layer_idx = layer_idx
self.hidden_size = config.hidden_size
self.self_attn = ParallelLlamaAttention(config=config, megatron_config=megatron_config)
self.mlp = ParallelLlamaMLP(config, megatron_config=megatron_config)
self.input_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
self.post_attention_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
"""
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Note: sequence parallel is hidden inside ColumnParallelLinear
# reduce scatter is hidden inside RowParallelLinear
# Self Attention
hidden_states = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
)
# TODO: add sequence parallel operator reduce_scatter here
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
# TODO: add sequence parallel operator all_gather here
hidden_states = self.mlp(hidden_states)
# TODO: add sequence parallel operator reduce_scatter here
hidden_states = residual + hidden_states
outputs = hidden_states
return outputs
class ParallelLlamaDecoderLayerRmPad(nn.Module):
def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig, layer_idx: int):
super().__init__()
self.config: TransformerConfig = convert_config(config, megatron_config)
self.layer_idx = layer_idx
self.hidden_size = config.hidden_size
self.self_attn = ParallelLlamaAttentionRmPad(config=config, megatron_config=megatron_config)
self.mlp = ParallelLlamaMLP(config, megatron_config=megatron_config)
self.input_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
self.post_attention_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
def forward(
self,
hidden_states: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
sequence_length: int = None,
indices: torch.Tensor = None,
cu_seqlens: torch.Tensor = None,
max_seqlen_in_batch: int = None
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states # (total_nnz // sp, 1, hidden_size)
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
# (total_nnz // sp, 1, hidden_size) -> all-gather (total_nnz, 1, hidden_size)
# -> col + row -> reduce-scatter -> (total_nnz // sp, 1, hidden_size)
hidden_states = self.self_attn(hidden_states=hidden_states,
position_ids=position_ids,
sequence_length=sequence_length,
indices=indices,
cu_seqlens=cu_seqlens,
max_seqlen_in_batch=max_seqlen_in_batch)
hidden_states = residual + hidden_states
# Fully Connected
# shape changes same as attn
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = hidden_states
return outputs
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/linear.py
from typing import Optional, Tuple
from megatron.core import tensor_parallel
class QKVParallelLinear(tensor_parallel.ColumnParallelLinear):
def __init__(self,
input_size,
num_heads,
num_key_value_heads,
head_dim,
*,
bias=True,
gather_output=True,
skip_bias_add=False,
**kwargs):
# Keep the input parameters; the head counts here are the full (unsharded) values
self.input_size = input_size
self.q_output_size = num_heads * head_dim
self.kv_output_size = num_key_value_heads * head_dim
self.head_dim = head_dim
self.gather_output = gather_output
self.skip_bias_add = skip_bias_add
input_size = self.input_size
output_size = (num_heads + 2 * num_key_value_heads) * self.head_dim
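# Q, K and V are fused into a single column-parallel linear; the output dim is sharded across
# TP ranks, and the checkpoint loader packs the weight so each rank's slice is [q_i; k_i; v_i],
# which ParallelLlamaAttention.forward splits back with [q_size, k_size, v_size]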
super().__init__(input_size=input_size,
output_size=output_size,
bias=bias,
gather_output=gather_output,
skip_bias_add=skip_bias_add,
**kwargs)
class MergedColumnParallelLinear(tensor_parallel.ColumnParallelLinear):
def __init__(self,
input_size,
gate_ouput_size,
up_output_size,
*,
bias=True,
gather_output=True,
skip_bias_add=False,
**kwargs):
# Keep the input parameters; the output sizes here are the full (unsharded) values
self.input_size = input_size
self.output_size = gate_ouput_size + up_output_size
self.gather_output = gather_output
self.skip_bias_add = skip_bias_add
super().__init__(input_size=self.input_size,
output_size=self.output_size,
bias=bias,
gather_output=gather_output,
skip_bias_add=skip_bias_add,
**kwargs)
import torch
class LinearForLastLayer(torch.nn.Linear):
def __init__(
self,
input_size,
output_size,
*,
config,
bias=True,
):
super().__init__(in_features=input_size, out_features=output_size, bias=bias)
self.sequence_parallel = config.sequence_parallel
if self.sequence_parallel:
setattr(self.weight, 'sequence_parallel', True)
def forward(
self,
input_,
weight=None,
runtime_gather_output=None,
):
logits = super().forward(input_)
logits = logits.float()
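# upcast to fp32 for numerical stability; with sequence parallelism each rank holds only a
# slice of the sequence, so the logits are gathered back to the full sequence before returning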
if self.sequence_parallel:
logits = tensor_parallel.gather_from_sequence_parallel_region(logits, tensor_parallel_output_grad=False)
return logits, None