# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Qwen2 model."""
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from megatron.core import tensor_parallel, parallel_state
from megatron.core import ModelParallelConfig
from megatron.core import mpu
from torch import nn
from transformers.modeling_outputs import BaseModelOutputWithPast
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
from transformers.models.qwen2.modeling_qwen2 import CausalLMOutputWithPast
from verl.utils.megatron import sequence_parallel as sp_utils
from verl.utils.megatron import tensor_parallel as tp_utils
from verl.utils.megatron_utils import TransformerConfig, convert_config
from .layers import ParallelQwen2DecoderLayer, ParallelQwen2RMSNorm, ParallelQwen2DecoderLayerRmPad
"""
TODO:
1. Add weight initialization. Here we need to be careful on TP weight init.
2. Add sequence parallel
3. Load checkpoint from Qwen2 pretrained checkpoint
"""
# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device):
"""
Make the causal mask used for auto-regressive self-attention.
"""
bsz, tgt_len = input_ids_shape
mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
mask_cond = torch.arange(mask.size(-1), device=device)
mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
mask = mask.to(dtype)
return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len)
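# Illustrative note (not in the original source): for tgt_len = 3 the mask above
# is zero on and below the diagonal and dtype-min above it, roughly
#   [[0, -inf, -inf],
#    [0,    0, -inf],
#    [0,    0,    0]]
# broadcast to shape (bsz, 1, 3, 3), so each position can attend only to itself
# and earlier positions.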
# Copied from transformers.models.bart.modeling_bart._expand_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
bsz, src_len = mask.size()
tgt_len = tgt_len if tgt_len is not None else src_len
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
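# Illustrative note (not in the original source): a padding mask row [1, 1, 0]
# expands to additive biases [0, 0, dtype-min] along the source dimension, so
# attention scores on padded positions are pushed to -inf before the softmax.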
class ParallelQwen2Model(nn.Module):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]
Args:
config: Qwen2Config
"""
def __init__(self, config: Qwen2Config, megatron_config: ModelParallelConfig):
super().__init__()
self.config: TransformerConfig = convert_config(config, megatron_config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
embedding_kwargs = tp_utils.get_default_kwargs_for_parallel_embedding()
if megatron_config is not None:
assert embedding_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(embedding_kwargs, megatron_config)
self.embed_tokens = tensor_parallel.VocabParallelEmbedding(num_embeddings=config.vocab_size,
embedding_dim=config.hidden_size,
**embedding_kwargs)
self.layers = nn.ModuleList(
[ParallelQwen2DecoderLayer(config, megatron_config) for _ in range(config.num_hidden_layers)])
self.norm = ParallelQwen2RMSNorm(config, megatron_config)
# Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds):
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
combined_attention_mask = None
if input_shape[-1] > 1:
combined_attention_mask = _make_causal_mask(
input_shape,
inputs_embeds.dtype,
device=inputs_embeds.device,
)
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype,
tgt_len=input_shape[-1]).to(inputs_embeds.device)
combined_attention_mask = (expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask +
combined_attention_mask)
return combined_attention_mask
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
"""
Args:
input_ids: input ids. shape (batch_size, seq_length)
attention_mask: attention_mask. shape (batch_size, seq_length)
position_ids: position ids. shape (batch_size, seq_length)
Returns:
"""
batch_size, seq_length = input_ids.shape
inputs_embeds = self.embed_tokens(input_ids)
# embed positions
attention_mask = self._prepare_decoder_attention_mask(attention_mask, (batch_size, seq_length), inputs_embeds)
hidden_states = inputs_embeds
for idx, decoder_layer in enumerate(self.layers):
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
)
hidden_states = layer_outputs
hidden_states = self.norm(hidden_states)
return hidden_states
class ParallelQwen2ForCausalLM(nn.Module):
def __init__(self, config: Qwen2Config, megatron_config: ModelParallelConfig):
super().__init__()
self.config: TransformerConfig = convert_config(config, megatron_config)
self.model = ParallelQwen2Model(config, megatron_config=megatron_config)
self.vocab_size = config.vocab_size
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
if megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(column_kwargs, megatron_config)
self.lm_head = tensor_parallel.ColumnParallelLinear(input_size=config.hidden_size,
output_size=config.vocab_size,
bias=False,
gather_output=False,
skip_bias_add=False,
**column_kwargs)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
"""
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
)
hidden_states = outputs
logits = self.lm_head(hidden_states)[0]
logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits)
logits = logits.float()
return CausalLMOutputWithPast(
loss=None,
logits=logits,
past_key_values=None,
hidden_states=None,
attentions=None,
)
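# A rough usage sketch (illustrative only; it assumes Megatron's parallel state
# is already initialized, and `hf_config` / `megatron_config` are hypothetical
# pre-built Qwen2Config / ModelParallelConfig objects):
#
#   model = ParallelQwen2ForCausalLM(hf_config, megatron_config)
#   out = model(input_ids=input_ids, attention_mask=attention_mask,
#               position_ids=position_ids)
#   logits = out.logits  # (batch_size, seq_length, vocab_size), gathered over tp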
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
class ParallelQwen2ModelRmPad(nn.Module):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]
Args:
config: Qwen2Config
"""
def __init__(self, config: Qwen2Config, megatron_config: ModelParallelConfig):
super().__init__()
self.config: TransformerConfig = convert_config(config, megatron_config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
embedding_kwargs = tp_utils.get_default_kwargs_for_parallel_embedding()
self.megatron_config = megatron_config
if megatron_config is not None:
assert embedding_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(embedding_kwargs, self.megatron_config)
self.embed_tokens = tensor_parallel.VocabParallelEmbedding(num_embeddings=config.vocab_size,
embedding_dim=config.hidden_size,
**embedding_kwargs)
self.layers = nn.ModuleList(
[ParallelQwen2DecoderLayerRmPad(config, megatron_config) for _ in range(config.num_hidden_layers)])
self.norm = ParallelQwen2RMSNorm(config, megatron_config)
def forward(self,
input_ids: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
sequence_length: int = None,
indices: torch.Tensor = None,
cu_seqlens: torch.Tensor = None,
max_seqlen_in_batch: int = None) -> Union[Tuple, BaseModelOutputWithPast]:
"""
Args:
input_ids: input ids. shape (1, total_nnz)
position_ids: position ids. shape (batch_size, seq_length)
Returns:
"""
inputs_embeds = self.embed_tokens(input_ids) # (1, total_nnz) -> (1, total_nnz, hidden_size)
# (1, total_nnz, hidden_size) -> (total_nnz, 1, hidden_size) -> (total_nnz // sp, 1, hidden_size)
inputs_embeds = inputs_embeds.transpose(0, 1)
if self.megatron_config.sequence_parallel:
inputs_embeds = tensor_parallel.scatter_to_sequence_parallel_region(inputs_embeds)
hidden_states = inputs_embeds
for idx, decoder_layer in enumerate(self.layers):
layer_outputs = decoder_layer(hidden_states,
position_ids=position_ids,
sequence_length=sequence_length,
indices=indices,
cu_seqlens=cu_seqlens,
max_seqlen_in_batch=max_seqlen_in_batch)
hidden_states = layer_outputs
hidden_states = self.norm(hidden_states)
return hidden_states
class ParallelQwen2ForCausalLMRmPad(nn.Module):
def __init__(self, config: Qwen2Config, megatron_config: ModelParallelConfig):
super().__init__()
self.config: TransformerConfig = convert_config(config, megatron_config)
self.megatron_config = megatron_config
self.model = ParallelQwen2ModelRmPad(config, megatron_config=megatron_config)
self.vocab_size = config.vocab_size
self._init_head(config)
def _init_head(self, config: Qwen2Config):
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
if self.megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
self.lm_head = tensor_parallel.ColumnParallelLinear(input_size=config.hidden_size,
output_size=config.vocab_size,
bias=False,
gather_output=False,
skip_bias_add=False,
**column_kwargs)
def _forward_head(self, hidden_states):
# all_gather from sequence parallel region is performed inside lm_head
logits = self.lm_head(hidden_states)[0]
logits = logits.float() # (total_nnz_padded, 1, vocab_size // tp)
logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) # (total_nnz_padded, 1, vocab_size)
return logits
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
"""
batch_size, sequence_length = input_ids.shape
# remove padding here
input_ids, indices, cu_seqlens, max_seqlen_in_batch, *_ = unpad_input(input_ids.unsqueeze(dim=-1),
attention_mask) # (total_nnz, 1)
# pad input_ids to multiple of tp for all tp ranks
# TODO: for better performance, the sp padding should be removed at each layer; the size of the performance gap is untested
if self.megatron_config.sequence_parallel:
input_ids = sp_utils.pad_to_sequence_parallel(input_ids)
input_ids = input_ids.transpose(0, 1) # (1, total_nnz+pad)
outputs = self.model(input_ids=input_ids,
position_ids=position_ids,
sequence_length=sequence_length,
indices=indices,
cu_seqlens=cu_seqlens,
max_seqlen_in_batch=max_seqlen_in_batch)
hidden_states = outputs
logits = self._forward_head(hidden_states)
# remove padding from sequence parallel
if self.megatron_config.sequence_parallel:
total_nnz = cu_seqlens[-1]
logits = logits[:total_nnz] # (total_nnz, 1, vocab_size)
logits = torch.squeeze(logits, dim=1) # remove the artificial batch dimension
# add removed padding back
logits = pad_input(logits, indices, batch_size,
seqlen=sequence_length) # (batch_size, sequence_length, vocab_size)
return CausalLMOutputWithPast(
loss=None,
logits=logits,
past_key_values=None,
hidden_states=None,
attentions=None,
)
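# Illustrative summary (not in the original source): the RmPad forward is an
# unpad -> compute -> repad round trip. unpad_input packs (batch, seqlen) into
# (total_nnz, 1) using the attention mask, the decoder runs on the packed
# tokens (optionally padded again for sequence parallelism), and pad_input
# scatters the logits back to (batch_size, sequence_length, vocab_size).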
class ParallelQwen2ForValueRmPad(ParallelQwen2ForCausalLMRmPad):
def _init_head(self, config):
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
if self.megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
self.lm_head = nn.Linear(in_features=config.hidden_size, out_features=1, bias=False)
# under sequence parallelism, the value head consumes sequence-parallel activations, so mark its weight as a sequence-parallel parameter
sp_utils.mark_parameter_as_sequence_parallel(self.lm_head.weight)
def _forward_head(self, hidden_states):
logits = self.lm_head(hidden_states) # (total_nnz_padded // tp, 1, 1)
logits = logits.float()
if self.megatron_config.sequence_parallel:
logits = tensor_parallel.gather_from_sequence_parallel_region(logits, tensor_parallel_output_grad=False)
return logits
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
output = super().forward(input_ids, attention_mask, position_ids)
output.logits = torch.squeeze(output.logits, dim=-1)
return output
"""
Support pipeline parallelism
"""
class ParallelQwen2ModelRmPadPP(nn.Module):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]
This model definition supports pipeline parallelism. To support pp and vpp,
- This model only contains layer in this pp stage and vpp chunk
- When calling get_model in Megatron, this rank will instantiate all the vpp chunks in this pp.
Args:
config: Qwen2Config
"""
def __init__(self, config: Qwen2Config, megatron_config: ModelParallelConfig, pre_process, post_process):
super().__init__()
self.config: TransformerConfig = convert_config(config, megatron_config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.pre_process = pre_process
self.post_process = post_process
self.megatron_config = megatron_config
embedding_kwargs = tp_utils.get_default_kwargs_for_parallel_embedding()
if megatron_config is not None:
assert embedding_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(embedding_kwargs, self.megatron_config)
if pre_process:
self.embed_tokens = tensor_parallel.VocabParallelEmbedding(num_embeddings=config.vocab_size,
embedding_dim=config.hidden_size,
**embedding_kwargs)
else:
self.embed_tokens = None
pp_rank = mpu.get_pipeline_model_parallel_rank()
pp_size = megatron_config.pipeline_model_parallel_size
self.num_layer_per_pp = config.num_hidden_layers // pp_size
vpp_size = megatron_config.virtual_pipeline_model_parallel_size
vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank()
if vpp_size is not None:
self.num_layer_vpp_chunk = self.num_layer_per_pp // vpp_size
self.num_layer_this_model = self.num_layer_vpp_chunk
offset = vpp_rank * (config.num_hidden_layers // vpp_size) + (pp_rank * self.num_layer_vpp_chunk)
else:
self.num_layer_this_model = self.num_layer_per_pp
offset = pp_rank * self.num_layer_per_pp
self.layers = nn.ModuleList()
for i in range(self.num_layer_this_model):
layer = ParallelQwen2DecoderLayerRmPad(config, megatron_config, layer_idx=i + offset)
self.layers.add_module(f'{i}', layer)
if post_process:
self.norm = ParallelQwen2RMSNorm(config, megatron_config)
else:
self.norm = None
def set_input_tensor(self, input_tensor):
"""Set input tensor to be used instead of forward()'s input.
When doing pipeline parallelism the input from the previous
stage comes from communication, not from the input, so the
model's forward_step_func won't have it. This function is thus
used by internal code to bypass the input provided by the
forward_step_func"""
self.input_tensor = input_tensor
def forward(self,
input_ids: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
sequence_length: int = None,
indices: torch.Tensor = None,
cu_seqlens: torch.Tensor = None,
max_seqlen_in_batch: int = None) -> Union[Tuple, BaseModelOutputWithPast]:
"""
Args:
input_ids: input ids. shape (1, total_nnz)
position_ids: position ids. shape (batch_size, seq_length)
Returns:
"""
if self.pre_process:
inputs_embeds = self.embed_tokens(input_ids) # (1, total_nnz) -> (1, total_nnz, hidden_size)
# vocab parallel embedding will not do sequence parallel reduce-scatter in open source megatron
# so we need to handle it here:
# (1, total_nnz, hidden_size) -> (total_nnz, 1, hidden_size) -> (total_nnz // sp, 1, hidden_size)
inputs_embeds = inputs_embeds.transpose(0, 1)
if self.megatron_config.sequence_parallel:
inputs_embeds = tensor_parallel.scatter_to_sequence_parallel_region(inputs_embeds)
hidden_states = inputs_embeds
else:
# for intermediate pp stages, the hidden states are provided by Megatron via set_input_tensor
hidden_states = self.input_tensor
for idx, decoder_layer in enumerate(self.layers):
layer_outputs = decoder_layer(hidden_states,
position_ids=position_ids,
sequence_length=sequence_length,
indices=indices,
cu_seqlens=cu_seqlens,
max_seqlen_in_batch=max_seqlen_in_batch)
hidden_states = layer_outputs
if self.post_process:
hidden_states = self.norm(hidden_states)
return hidden_states
class ParallelQwen2ForCausalLMRmPadPP(nn.Module):
def __init__(self, config: Qwen2Config, megatron_config: ModelParallelConfig, pre_process, post_process,
share_embeddings_and_output_weights):
super().__init__()
self.config: TransformerConfig = convert_config(config, megatron_config)
self.megatron_config = megatron_config
self.model = ParallelQwen2ModelRmPadPP(config,
megatron_config=megatron_config,
pre_process=pre_process,
post_process=post_process)
self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
self.vocab_size = config.vocab_size
self.pre_process = pre_process
self.post_process = post_process
if post_process:
self._init_head(config)
if pre_process or post_process:
self.setup_embeddings_and_output_layer()
def set_input_tensor(self, input_tensor):
"""Set input tensor to be used instead of forward()'s input.
When doing pipeline parallelism the input from the previous
stage comes from communication, not from the input, so the
model's forward_step_func won't have it. This function is thus
used by internal code to bypass the input provided by the
forward_step_func"""
assert len(input_tensor) == 1
self.model.set_input_tensor(input_tensor[0])
def _init_head(self, config):
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
if self.megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
self.lm_head = tensor_parallel.ColumnParallelLinear(input_size=config.hidden_size,
output_size=config.vocab_size,
bias=False,
gather_output=False,
skip_bias_add=False,
skip_weight_param_allocation=self.pre_process and
self.share_embeddings_and_output_weights,
**column_kwargs)
def setup_embeddings_and_output_layer(self) -> None:
"""Sets up embedding layer in first stage and output layer in last stage.
This function initializes word embeddings in the final stage when we are
using pipeline parallelism and sharing word embeddings, and sets up param
attributes on the embedding and output layers.
"""
# Set `is_embedding_or_output_parameter` attribute.
if self.pre_process:
self.model.embed_tokens.weight.is_embedding_or_output_parameter = True
if self.post_process and self.lm_head.weight is not None:
self.lm_head.weight.is_embedding_or_output_parameter = True
if not self.share_embeddings_and_output_weights:
return
if parallel_state.get_pipeline_model_parallel_world_size() == 1:
# Zero out wgrad if sharing embeddings between two layers on same
# pipeline stage to make sure grad accumulation into main_grad is
# correct and does not include garbage values (e.g., from torch.empty).
self.shared_embedding_or_output_weight().zero_out_wgrad = True
return
if parallel_state.is_pipeline_first_stage() and self.pre_process and not self.post_process:
self.shared_embedding_or_output_weight().shared_embedding = True
if self.post_process and not self.pre_process:
assert not parallel_state.is_pipeline_first_stage()
# set word_embeddings weights to 0 here, then copy first
# stage's weights using all_reduce below.
self.lm_head.weight.data.fill_(0)
self.lm_head.weight.shared = True
self.lm_head.weight.shared_embedding = True
if torch.distributed.is_initialized():
if parallel_state.is_rank_in_embedding_group():
weight = self.shared_embedding_or_output_weight()
weight.data = weight.data.cuda()
torch.distributed.all_reduce(weight.data, group=parallel_state.get_embedding_group())
def shared_embedding_or_output_weight(self) -> torch.Tensor:
if self.pre_process:
return self.model.embed_tokens.weight
elif self.post_process:
return self.lm_head.weight
return None
def _forward_head(self, hidden_states):
# all_gather from sequence parallel region is performed inside lm_head
# print(f'logits shape before forward_head: {hidden_states.shape}, vocab_size = {self.config.vocab_size}') # [4, 32, 4096]
output_weight = None
if self.share_embeddings_and_output_weights:
output_weight = self.shared_embedding_or_output_weight()
logits = self.lm_head(hidden_states, weight=output_weight)[0]
# print(f'logits shape after forward_head: {logits.shape}') # [8, 32, 8]
logits = logits.float() # (total_nnz_padded, 1, vocab_size // tp)
return logits
def forward(
self,
# original input
*,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
"""
# Note that input_ids, attention_mask and position_ids should be passed to every pp stage.
# In the first pp stage, input_ids are consumed; in later stages, self.model uses the hidden_states set via set_input_tensor.
batch_size, sequence_length = input_ids.shape
# remove padding here
input_ids_rmpad, indices, cu_seqlens, max_seqlen_in_batch, *_ = unpad_input(input_ids.unsqueeze(dim=-1),
attention_mask) # (total_nnz, 1)
# pad input_ids to multiple of tp for all tp ranks
# TODO: for better performance, the sp padding should be removed at each layer; the size of the performance gap is untested
if self.megatron_config.sequence_parallel:
input_ids_rmpad = sp_utils.pad_to_sequence_parallel(input_ids_rmpad)
input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz+pad)
outputs = self.model(input_ids=input_ids_rmpad,
position_ids=position_ids,
sequence_length=sequence_length,
indices=indices,
cu_seqlens=cu_seqlens,
max_seqlen_in_batch=max_seqlen_in_batch)
if self.post_process:
hidden_states = outputs
logits = self._forward_head(hidden_states)
logits = torch.squeeze(logits, dim=1) # remove the artificial batch dimension # torch.Size([8, 32, 16])
# remove padding from sequence parallel
if self.megatron_config.sequence_parallel:
total_nnz = cu_seqlens[-1]
logits = logits[:total_nnz] # (total_nnz, vocab_size)
# add removed padding back. If input is already rmpad, we let the caller pad_input
logits = pad_input(logits, indices, batch_size,
seqlen=sequence_length) # (batch_size, sequence_length, vocab_size)
return CausalLMOutputWithPast(
loss=None,
logits=logits,
past_key_values=None,
hidden_states=None,
attentions=None,
)
else:
return outputs
class ParallelQwen2ForValueRmPadPP(ParallelQwen2ForCausalLMRmPadPP):
def _init_head(self, config):
column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
if self.megatron_config is not None:
assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
self.lm_head = nn.Linear(in_features=config.hidden_size, out_features=1, bias=False)
# under sequence parallelism, the value head consumes sequence-parallel activations, so mark its weight as a sequence-parallel parameter
sp_utils.mark_parameter_as_sequence_parallel(self.lm_head.weight)
def _forward_head(self, hidden_states):
logits = self.lm_head(hidden_states) # (total_nnz_padded // tp, 1, 1)
logits = logits.float()
if self.megatron_config.sequence_parallel:
logits = tensor_parallel.gather_from_sequence_parallel_region(logits, tensor_parallel_output_grad=False)
return logits
def forward(
self,
*,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
output = super().forward(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids)
if self.post_process:
output.logits = torch.squeeze(output.logits, dim=-1)
return output
else:
return output
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
from typing import List, Optional, Type
import torch.nn as nn
# Supported models in Megatron-LM
# Architecture -> (module, class).
_MODELS = {
"LlamaForCausalLM":
("llama", ("ParallelLlamaForCausalLMRmPadPP", "ParallelLlamaForValueRmPadPP", "ParallelLlamaForCausalLMRmPad")),
"Qwen2ForCausalLM":
("qwen2", ("ParallelQwen2ForCausalLMRmPadPP", "ParallelQwen2ForValueRmPadPP", "ParallelQwen2ForCausalLMRmPad")),
"MistralForCausalLM": ("mistral", ("ParallelMistralForCausalLMRmPadPP", "ParallelMistralForValueRmPadPP",
"ParallelMistralForCausalLMRmPad"))
}
# return model class
class ModelRegistry:
@staticmethod
def load_model_cls(model_arch: str, value=False) -> Optional[Type[nn.Module]]:
if model_arch not in _MODELS:
return None
module_name, model_cls_name = _MODELS[model_arch]
if not value: # actor/ref
model_cls_name = model_cls_name[0]
else: # critic/rm
model_cls_name = model_cls_name[1]
module = importlib.import_module(f"verl.models.{module_name}.megatron.modeling_{module_name}_megatron")
return getattr(module, model_cls_name, None)
@staticmethod
def get_supported_archs() -> List[str]:
return list(_MODELS.keys())
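# A minimal usage sketch (illustrative only; assumes the corresponding verl
# model modules are importable in the current environment):
#
#   actor_cls = ModelRegistry.load_model_cls("Qwen2ForCausalLM", value=False)
#   critic_cls = ModelRegistry.load_model_cls("Qwen2ForCausalLM", value=True)
#   assert "Qwen2ForCausalLM" in ModelRegistry.get_supported_archs()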
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from typing import Optional, Tuple, Callable
import sys
if sys.version_info >= (3, 11):
from typing import Unpack
else:
from typing_extensions import Unpack
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
from transformers.cache_utils import Cache
from transformers.utils import logging
from transformers.modeling_flash_attention_utils import _flash_attention_forward
from verl.utils.ulysses import gather_heads_scatter_seq, gather_seq_scatter_heads, \
get_ulysses_sequence_parallel_world_size, validate_ulysses_config
logger = logging.get_logger(__name__)
def llama_flash_attn_forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""
Adapted from transformers 4.47.1 to support Ulysses sequence parallelism.
NOTE: This function is used for transformers versions in the range [4.45.0, 4.47.1].
"""
output_attentions = False
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# Flash attention requires the input to have the shape
# batch_size x seq_length x num_heads x head_dim
# therefore we just need to keep the original shape
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
# trade off: repeat first and then all to all
# key_states = repeat_kv(key_states, self.num_key_value_groups)
# value_states = repeat_kv(value_states, self.num_key_value_groups)
########## AlltoAll for Ulysses ##########
ulysses_sp_size = get_ulysses_sequence_parallel_world_size()
if ulysses_sp_size > 1:
validate_ulysses_config(self.num_heads, ulysses_sp_size)
# (bsz, n_head, seq_len/n, head_dim) -> (bsz, n_head/n, seq_len, head_dim)
query_states = gather_seq_scatter_heads(query_states, seq_dim=2, head_dim=1)
key_states = gather_seq_scatter_heads(key_states, seq_dim=2, head_dim=1)
value_states = gather_seq_scatter_heads(value_states, seq_dim=2, head_dim=1)
full_q_len = query_states.size(2) # full seq length
if position_embeddings is None:
logger.warning_once(
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
"removed and `position_embeddings` will be mandatory.")
cos, sin = self.rotary_emb(value_states, position_ids)
else:
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
# TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
# to be able to avoid many of these transpose/reshape/view.
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
dropout_rate = self.attention_dropout if self.training else 0.0
# In PEFT, we usually cast the layer norms to float32 for training stability,
# so the input hidden states may get silently cast to float32. We therefore
# cast them back to the correct dtype just to be sure everything works as expected.
# This might slow down training & inference, so it is recommended not to cast the
# LayerNorms to fp32. (LlamaRMSNorm handles it correctly)
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once(
f"The input hidden states seem to be silently cast to float32; this might be because"
f" you have upcasted embedding or layer norm layers to float32. We will cast the input back to"
f" {target_dtype}.")
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
full_q_len,
position_ids=position_ids,
dropout=dropout_rate,
sliding_window=getattr(self, "sliding_window", None),
use_top_left_mask=self._flash_attn_uses_top_left_mask,
is_causal=self.is_causal,
**kwargs,
)
attn_output = attn_output.reshape(bsz, full_q_len, -1, self.head_dim).contiguous()
########## AlltoAll for Ulysses ##########
if ulysses_sp_size > 1:
attn_output = gather_heads_scatter_seq(attn_output, seq_dim=1, head_dim=2)
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
def llama_attn_forward(
self,
hidden_states: torch.Tensor,
position_embeddings: Tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""
Adapted from transformers 4.49.0 to support Ulysses sequence parallelism for transformers >= 4.48.0.
NOTE: This function has been tested only on transformers versions between 4.48.0 and 4.50.0.
"""
from transformers.models.llama.modeling_llama import eager_attention_forward
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
bsz, q_len, _ = hidden_states.shape
query_states = self.q_proj(hidden_states).view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = self.k_proj(hidden_states).view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = self.v_proj(hidden_states).view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
########## AlltoAll for Ulysses ##########
ulysses_sp_size = get_ulysses_sequence_parallel_world_size()
if ulysses_sp_size > 1:
validate_ulysses_config(self.config.num_attention_heads, ulysses_sp_size)
query_states = gather_seq_scatter_heads(query_states, seq_dim=2, head_dim=1)
key_states = gather_seq_scatter_heads(key_states, seq_dim=2, head_dim=1)
value_states = gather_seq_scatter_heads(value_states, seq_dim=2, head_dim=1)
full_q_len = query_states.size(2)
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
query_states,
key_states,
value_states,
attention_mask,
dropout=0.0 if not self.training else self.attention_dropout,
scaling=self.scaling,
**kwargs,
)
attn_output = attn_output.reshape(bsz, full_q_len, -1, self.head_dim).contiguous()
########## AlltoAll for Ulysses ##########
if ulysses_sp_size > 1:
attn_output = gather_heads_scatter_seq(attn_output, seq_dim=1, head_dim=2)
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
attn_output = self.o_proj(attn_output)
return attn_output, attn_weights
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Apply monkey-patch function to models
"""
import sys
from typing import Optional
import torch
from transformers.modeling_utils import PreTrainedModel
from transformers.modeling_flash_attention_utils import _flash_attention_forward
from verl.utils.ulysses import (
gather_heads_scatter_seq,
gather_seq_scatter_heads,
get_ulysses_sequence_parallel_world_size,
get_ulysses_sequence_parallel_group,
)
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=2, repeats=n_rep). The hidden states go from (batch,
seqlen, num_key_value_heads, head_dim) to (batch, seqlen, num_attention_heads, head_dim)
"""
batch, slen, num_key_value_heads, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :, :, None, :].expand(batch, slen, num_key_value_heads, n_rep, head_dim)
return hidden_states.reshape(batch, slen, num_key_value_heads * n_rep, head_dim)
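# Illustrative note (not in the original source): with n_rep = 2, a KV tensor of
# shape (batch=2, seqlen=5, num_key_value_heads=4, head_dim=64) becomes
# (2, 5, 8, 64); each kv head is duplicated consecutively (h0, h0, h1, h1, ...)
# rather than the whole head block being tiled.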
def _ulysses_flash_attention_forward(
query_states: torch.Tensor,
key_states: torch.Tensor,
value_states: torch.Tensor,
*args,
position_ids: Optional[torch.Tensor] = None,
**kwargs,
):
"""Insert all-to-all before and after flash attention.
DeepSpeed-Ulysses: https://arxiv.org/pdf/2309.14509
Args:
query_states (torch.Tensor): (batch_size, seqlen/sp_size, nheads, head_dim)
key_states (torch.Tensor): (batch_size, seqlen/sp_size, nheads_k, head_dim)
value_states (torch.Tensor): (batch_size, seqlen/sp_size, nheads_k, head_dim)
position_ids (torch.Tensor, optional): (batch_size, seqlen/sp_size)
Returns:
torch.Tensor: (batch_size, seqlen/sp_size, nheads, head_dim)
"""
ulysses_sp_size = get_ulysses_sequence_parallel_world_size()
########## AlltoAll for Ulysses ##########
if ulysses_sp_size > 1:
assert position_ids is not None, "position_ids is required for Ulysses sequence parallelism"
# NOTE: repeat kv heads so the kv head count is divisible by the sequence parallel size.
# Instead of repeating nheads_q//nheads_k times, we repeat sp_size//nheads_k times, since flash_attention supports MQA/GQA.
# For example:
# - nheads_k=4, sp=8, repeats=2
# - nheads_k=8, sp=8, repeats=1
# - nheads_k=16, sp=8, repeats=1
repeats = max(ulysses_sp_size // key_states.size(2), 1)
key_states = repeat_kv(key_states, repeats)
value_states = repeat_kv(value_states, repeats)
# (bsz, seq_len/n, n_head, head_dim) -> (bsz, seq_len, n_head/n, head_dim)
query_states = gather_seq_scatter_heads(query_states, seq_dim=1, head_dim=2)
key_states = gather_seq_scatter_heads(key_states, seq_dim=1, head_dim=2)
value_states = gather_seq_scatter_heads(value_states, seq_dim=1, head_dim=2)
# TODO: all_gather position_ids because `prepare_fa2_from_position_ids` needs it, we can eliminate
# this all_gather by passing cu_seq_lens_q, cu_seq_lens_k, max_length_k, max_length_q explicitly.
# https://github.com/huggingface/transformers/pull/33932
# (bsz, seq_len/n) -> (bsz, seq_len)
position_ids_list = [torch.empty_like(position_ids) for _ in range(ulysses_sp_size)]
torch.distributed.all_gather(position_ids_list, position_ids, group=get_ulysses_sequence_parallel_group())
position_ids = torch.concat(position_ids_list, dim=-1)
# (bsz, seq_len, n_head/n, head_dim)
attn_output = _flash_attention_forward(query_states,
key_states,
value_states,
*args,
position_ids=position_ids,
**kwargs)
########## AlltoAll for Ulysses ##########
if ulysses_sp_size > 1:
# (bsz, seq_len, n_head/n, head_dim) -> (bsz, seq_len/n, n_head, head_dim)
attn_output = gather_heads_scatter_seq(attn_output, seq_dim=1, head_dim=2)
return attn_output
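# Illustrative shape walk-through (not in the original source), assuming
# ulysses_sp_size = 4, full seqlen = 8192, nheads = 32:
#   - each rank enters with query_states of shape (bsz, 2048, 32, head_dim)
#   - gather_seq_scatter_heads yields (bsz, 8192, 8, head_dim)
#   - flash attention runs over the full sequence with 1/4 of the heads
#   - gather_heads_scatter_seq restores (bsz, 2048, 32, head_dim)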
def apply_monkey_patch(model: PreTrainedModel, ulysses_sp_size: int):
"""Replace _flash_attention_forward to _ulysses_flash_attention_forward"""
module = sys.modules[model.__module__]
num_attention_heads, num_key_value_heads = model.config.num_attention_heads, model.config.num_key_value_heads
assert num_attention_heads % ulysses_sp_size == 0, \
f"num_attention_heads {num_attention_heads} must be divisible by ulysses_sp_size {ulysses_sp_size}"
assert num_key_value_heads % ulysses_sp_size == 0 or ulysses_sp_size % num_key_value_heads == 0, (
f"num_key_value_heads {num_key_value_heads} must be divisible by ulysses_sp_size {ulysses_sp_size} "
f"or vice versa. When ulysses_sp_size % num_key_value_heads == 0, "
f"kv heads are repeated to ensure correctness.")
# TODO: this branch currently covers VLM models only; unify the monkey patch with the LLM path.
if model.config.model_type in ("qwen2_vl", "qwen2_5_vl"): # patch remove padding for qwen2vl mrope
from verl.models.transformers.qwen2_vl import ulysses_flash_attn_forward
from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLFlashAttention2
from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLFlashAttention2
Qwen2VLFlashAttention2.forward = ulysses_flash_attn_forward
Qwen2_5_VLFlashAttention2.forward = ulysses_flash_attn_forward
print("Monkey patch FlashAttention2.forward in Qwen2VL")
return
# transformers<=4.47.1
if hasattr(module, "_flash_attention_forward"):
module._flash_attention_forward = _ulysses_flash_attention_forward
print(f"Monkey patch _flash_attention_forward in {model.__module__}")
else:
# transformers>=4.48.0
from transformers.integrations import flash_attention
flash_attention._flash_attention_forward = _ulysses_flash_attention_forward
print(f"Monkey patch _flash_attention_forward in {flash_attention.__name__}")
from functools import lru_cache
from packaging import version
import importlib.metadata
@lru_cache()
def is_transformers_version_in_range(min_version: str, max_version: str) -> bool:
try:
# Get the installed version of the transformers library
transformers_version = importlib.metadata.version("transformers")
except importlib.metadata.PackageNotFoundError:
raise ModuleNotFoundError("The `transformers` package is not installed.")
# Check if the version is within the specified range
return version.parse(min_version) <= version.parse(transformers_version) <= version.parse(max_version)
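# Illustrative usage (the exact call sites are an assumption, not part of this
# file): the range check can be used to pick which monkey patch applies, e.g.
#
#   if is_transformers_version_in_range("4.45.0", "4.47.1"):
#       LlamaFlashAttention2.forward = llama_flash_attn_forward
#   elif is_transformers_version_in_range("4.48.0", "4.50.0"):
#       LlamaAttention.forward = llama_attn_forward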
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from typing import Optional, Tuple, Callable
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
from transformers.cache_utils import Cache
from transformers.utils import logging
from transformers.modeling_flash_attention_utils import _flash_attention_forward
from transformers.processing_utils import Unpack
from verl.utils.ulysses import gather_heads_scatter_seq, gather_seq_scatter_heads, \
get_ulysses_sequence_parallel_world_size, validate_ulysses_config
logger = logging.get_logger(__name__)
def qwen2_flash_attn_forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
):
"""
Adapted from transformers 4.47.1 to support Ulysses sequence parallelism.
NOTE: This function is only tested on transformers versions between 4.45.0 and 4.47.1.
"""
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
########## AlltoAll for Ulysses ##########
ulysses_sp_size = get_ulysses_sequence_parallel_world_size()
if ulysses_sp_size > 1:
validate_ulysses_config(self.num_heads, ulysses_sp_size)
# (bsz, n_head, seq_len/n, head_dim) -> (bsz, n_head/n, seq_len, head_dim)
query_states = gather_seq_scatter_heads(query_states, seq_dim=2, head_dim=1)
key_states = gather_seq_scatter_heads(key_states, seq_dim=2, head_dim=1)
value_states = gather_seq_scatter_heads(value_states, seq_dim=2, head_dim=1)
full_q_len = query_states.size(2) # full seq length
if position_embeddings is None:
logger.warning_once(
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
"removed and `position_embeddings` will be mandatory.")
cos, sin = self.rotary_emb(value_states, position_ids)
else:
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
# repeat k/v heads if n_kv_heads < n_heads
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
dropout_rate = 0.0 if not self.training else self.attention_dropout
# In PEFT, we usually cast the layer norms to float32 for training stability,
# so the input hidden states may get silently cast to float32. Hence, we
# cast them back to the correct dtype just to be sure everything works as expected.
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once(
f"The input hidden states seem to be silently cast to float32; this might be because"
f" you have upcasted embedding or layer norm layers to float32. We will cast the input back to"
f" {target_dtype}.")
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
# Reshape to the expected shape for Flash Attention
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
if (self.config.use_sliding_window and getattr(self.config, "sliding_window", None) is not None and
self.layer_idx >= self.config.max_window_layers):
sliding_window = self.config.sliding_window
else:
sliding_window = None
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
full_q_len,
position_ids=position_ids,
dropout=dropout_rate,
sliding_window=sliding_window,
is_causal=self.is_causal,
use_top_left_mask=self._flash_attn_uses_top_left_mask,
)
# use full_q_len to reshape
attn_output = attn_output.reshape(bsz, full_q_len, -1, self.head_dim).contiguous()
########## AlltoAll for Ulysses ##########
if ulysses_sp_size > 1:
attn_output = gather_heads_scatter_seq(attn_output, seq_dim=1, head_dim=2)
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
def qwen2_attn_forward(
self,
hidden_states: torch.Tensor,
position_embeddings: Tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""
Adapted from transformers 4.49.0 to support Ulysses sequence parallelism for transformers >= 4.48.0.
NOTE: This function has been tested only on transformers versions between 4.48.0 and 4.50.0.
"""
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
bsz, q_len, _ = hidden_states.shape
hidden_shape = (bsz, q_len, -1, self.head_dim)
query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
########## AlltoAll for Ulysses ##########
ulysses_sp_size = get_ulysses_sequence_parallel_world_size()
if ulysses_sp_size > 1:
validate_ulysses_config(self.config.num_attention_heads, ulysses_sp_size)
# (bsz, n_head, seq_len/n, head_dim) -> (bsz, n_head/n, seq_len, head_dim)
query_states = gather_seq_scatter_heads(query_states, seq_dim=2, head_dim=1)
key_states = gather_seq_scatter_heads(key_states, seq_dim=2, head_dim=1)
value_states = gather_seq_scatter_heads(value_states, seq_dim=2, head_dim=1)
full_q_len = query_states.size(2)
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
sliding_window = None
if (self.config.use_sliding_window and getattr(self.config, "sliding_window", None) is not None and
self.layer_idx >= self.config.max_window_layers):
sliding_window = self.config.sliding_window
from transformers.models.qwen2.modeling_qwen2 import eager_attention_forward
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
query_states,
key_states,
value_states,
attention_mask,
dropout=0.0 if not self.training else self.attention_dropout,
scaling=self.scaling,
sliding_window=sliding_window, # main diff with Llama
**kwargs,
)
attn_output = attn_output.reshape(bsz, full_q_len, -1, self.head_dim).contiguous()
########## AlltoAll for Ulysses ##########
if ulysses_sp_size > 1:
# (bsz, seq_len, n_head/n, head_dim) -> (bsz, seq_len/n, n_head, head_dim)
attn_output = gather_heads_scatter_seq(attn_output, seq_dim=1, head_dim=2)
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
attn_output = self.o_proj(attn_output)
return attn_output, attn_weights
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Tuple
import inspect
import torch
import os
from transformers.utils import is_flash_attn_greater_or_equal
from transformers.modeling_flash_attention_utils import _flash_attention_forward
from verl.utils.ulysses import gather_heads_scatter_seq, gather_seq_scatter_heads, \
get_ulysses_sequence_parallel_world_size, validate_ulysses_config
try:
from flash_attn import flash_attn_func, flash_attn_varlen_func
_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
except ImportError:
flash_attn_varlen_func = None
def get_rope_index(
processor,
input_ids: torch.Tensor,
image_grid_thw: Optional[torch.Tensor] = None,
video_grid_thw: Optional[torch.Tensor] = None,
second_per_grid_ts: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""
Gets the position ids for Qwen2-VL; they should be generated before sharding the sequence.
The batch dim has been removed, so input_ids should be a 1D tensor representing a single example.
https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L1546
"""
spatial_merge_size = processor.image_processor.merge_size
tokens_per_second = 2
image_token_id = processor.tokenizer.convert_tokens_to_ids("<|image_pad|>")
video_token_id = processor.tokenizer.convert_tokens_to_ids("<|video_pad|>")
vision_start_token_id = processor.tokenizer.convert_tokens_to_ids("<|vision_start|>")
if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
position_ids = torch.ones(3, input_ids.size(0), dtype=input_ids.dtype, device=input_ids.device) # (3, seqlen)
image_index, video_index = 0, 0
input_ids = input_ids[attention_mask == 1]
image_nums, video_nums = 0, 0
vision_start_indices = torch.argwhere(input_ids == vision_start_token_id)
vision_tokens = input_ids[vision_start_indices + 1]
image_nums = (vision_tokens == image_token_id).sum()
video_nums = (vision_tokens == video_token_id).sum()
input_tokens = input_ids.tolist()
llm_pos_ids_list: list = []
st = 0
remain_images, remain_videos = image_nums, video_nums
for _ in range(image_nums + video_nums):
if image_token_id in input_tokens and remain_images > 0:
ed_image = input_tokens.index(image_token_id, st)
else:
ed_image = len(input_tokens) + 1
if video_token_id in input_tokens and remain_videos > 0:
ed_video = input_tokens.index(video_token_id, st)
else:
ed_video = len(input_tokens) + 1
if ed_image < ed_video:
t, h, w = (
image_grid_thw[image_index][0],
image_grid_thw[image_index][1],
image_grid_thw[image_index][2],
)
second_per_grid_t = 0
image_index += 1
remain_images -= 1
ed = ed_image
else:
t, h, w = (
video_grid_thw[video_index][0],
video_grid_thw[video_index][1],
video_grid_thw[video_index][2],
)
if second_per_grid_ts is not None:
second_per_grid_t = second_per_grid_ts[video_index]
else:
second_per_grid_t = 1.0
video_index += 1
remain_videos -= 1
ed = ed_video
llm_grid_t, llm_grid_h, llm_grid_w = (
t.item(),
h.item() // spatial_merge_size,
w.item() // spatial_merge_size,
)
text_len = ed - st
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w)
t_index = (t_index * second_per_grid_t * tokens_per_second).long().flatten()
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
st = ed + llm_grid_t * llm_grid_h * llm_grid_w
if st < len(input_tokens):
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
text_len = len(input_tokens) - st
llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
position_ids[..., attention_mask == 1] = llm_positions.to(position_ids.device)
else:
if attention_mask is not None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
position_ids = position_ids.unsqueeze(0).expand(3, -1).to(input_ids.device)
else:
position_ids = torch.arange(input_ids.shape[1], device=input_ids.device).view(1, -1).expand(3, -1)
return position_ids
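# A minimal, self-contained sketch of the text-only fallback branch above. The stub
# processor is hypothetical (only the attributes get_rope_index touches), and the token
# ids are made up; with no image/video grids, each of the 3 mrope channels degenerates
# to the ordinary 0..seqlen-1 positions.
def _example_get_rope_index_text_only():

    class _StubTokenizer:

        def convert_tokens_to_ids(self, token):
            return {"<|image_pad|>": 1, "<|video_pad|>": 2, "<|vision_start|>": 3}[token]

    class _StubImageProcessor:
        merge_size = 2

    class _StubProcessor:
        tokenizer = _StubTokenizer()
        image_processor = _StubImageProcessor()

    input_ids = torch.tensor([101, 102, 103, 104])  # 1D, a single unpadded example
    position_ids = get_rope_index(_StubProcessor(), input_ids, attention_mask=torch.ones(4, dtype=torch.long))
    assert position_ids.shape == (3, 4)
    assert (position_ids[0] == torch.arange(4)).all()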
def prepare_fa2_from_position_ids(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
position_ids: torch.Tensor):
query = query.view(-1, query.size(-2), query.size(-1))
key = key.view(-1, key.size(-2), key.size(-1))
value = value.view(-1, value.size(-2), value.size(-1))
position_ids = position_ids.flatten()
indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)
cu_seqlens = torch.cat((
indices_q[position_ids == 0],
torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
))
max_length = cu_seqlens.diff().max() # use cu_seqlens to infer max_length for qwen2vl mrope
return (query, key, value, indices_q, (cu_seqlens, cu_seqlens), (max_length, max_length))
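# A small numeric sketch (made-up shapes, not from the original callers): two sequences
# of lengths 3 and 2 packed into one row yield position_ids [0, 1, 2, 0, 1]; the zeros
# mark sequence starts, so cu_seqlens becomes [0, 3, 5] and the max length is 3.
def _example_prepare_fa2_from_position_ids():
    q = torch.zeros(1, 5, 2, 4)  # (batch=1, seqlen=5, num_heads=2, head_dim=4)
    position_ids = torch.tensor([[0, 1, 2, 0, 1]])
    q_flat, _, _, _, (cu_seqlens, _), (max_len, _) = prepare_fa2_from_position_ids(q, q, q, position_ids)
    assert cu_seqlens.tolist() == [0, 3, 5]
    assert max_len.item() == 3 and q_flat.shape == (5, 2, 4)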
def flash_attention_forward(
query_states: torch.Tensor,
key_states: torch.Tensor,
value_states: torch.Tensor,
attention_mask: torch.Tensor,
query_length: int,
is_causal: bool = True,
position_ids: Optional[torch.Tensor] = None,
sliding_window: Optional[int] = None,
use_top_left_mask: bool = False,
deterministic: Optional[bool] = None,
**kwargs,
):
"""
    Patches flash attention forward to handle 3D position ids in mrope, shaped (3, batch_size, seq_length).
"""
if not use_top_left_mask:
causal = is_causal
else:
causal = is_causal and query_length != 1
# Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length).
use_sliding_windows = (_flash_supports_window_size and sliding_window is not None and
key_states.shape[1] > sliding_window)
flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {}
if is_flash_attn_greater_or_equal("2.4.1"):
if deterministic is None:
deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
flash_kwargs["deterministic"] = deterministic
if position_ids is not None and query_length != 1 and not (torch.diff(position_ids[0], dim=-1) >= 0).all():
batch_size = query_states.size(0)
query_states, key_states, value_states, _, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids(
query_states, key_states, value_states, position_ids[0]) # remove channel dimension
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=kwargs.pop("dropout", 0.0),
softmax_scale=kwargs.pop("softmax_scale", None),
causal=causal,
**flash_kwargs,
)
attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1))
else:
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
query_length,
is_causal=is_causal,
sliding_window=sliding_window,
use_top_left_mask=use_top_left_mask,
deterministic=deterministic,
**kwargs,
) # do not pass position_ids to old flash_attention_forward
return attn_output
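# A CPU-only sketch of the branch condition above (assumed values): packed mrope batches
# are detected by non-monotonic position ids along the sequence, which routes them to the
# varlen kernel; a single contiguous sequence keeps the regular path.
def _example_varlen_branch_condition():
    packed = torch.tensor([[[0, 1, 2, 0, 1]]])  # (3 channels collapsed to 1, bsz=1, seqlen=5)
    single = torch.tensor([[[0, 1, 2, 3, 4]]])
    assert not (torch.diff(packed[0], dim=-1) >= 0).all()  # varlen path
    assert (torch.diff(single[0], dim=-1) >= 0).all()  # regular path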
def ulysses_flash_attn_forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
**kwargs,
) -> Tuple[torch.Tensor, None, None]:
from transformers.models.qwen2_vl.modeling_qwen2_vl import repeat_kv, apply_multimodal_rotary_pos_emb
bsz, q_len, _ = hidden_states.size() # q_len = seq_length / sp_size
query_states = self.q_proj(hidden_states) # (batch_size, seq_length / sp_size, num_heads * head_size)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
ulysses_sp_size = get_ulysses_sequence_parallel_world_size()
if ulysses_sp_size > 1:
validate_ulysses_config(self.num_heads, ulysses_sp_size)
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
query_states = gather_seq_scatter_heads(query_states, seq_dim=2, head_dim=1)
key_states = gather_seq_scatter_heads(key_states, seq_dim=2, head_dim=1)
value_states = gather_seq_scatter_heads(value_states, seq_dim=2, head_dim=1)
# (batch_size, num_head / sp_size, seq_length, head_size)
full_q_len = query_states.size(2) # full_q_len = seq_length
else:
full_q_len = q_len
# Because the input can be padded, the absolute sequence length depends on the max position id.
if position_embeddings is None:
cos, sin = self.rotary_emb(value_states, position_ids)
else:
cos, sin = position_embeddings
query_states, key_states = apply_multimodal_rotary_pos_emb(query_states, key_states, cos, sin,
self.rope_scaling["mrope_section"])
dropout_rate = 0.0 if not self.training else self.attention_dropout
    # Reshape to the expected shape for Flash Attention
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
if (self.config.use_sliding_window and getattr(self.config, "sliding_window", None) is not None and
self.layer_idx >= self.config.max_window_layers):
sliding_window = self.config.sliding_window
else:
sliding_window = None
attn_output = flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
full_q_len,
dropout=dropout_rate,
sliding_window=sliding_window,
is_causal=self.is_causal,
use_top_left_mask=self._flash_attn_uses_top_left_mask,
position_ids=position_ids, # important: pass position ids
) # (batch_size, seq_length, num_head / sp_size, head_size)
if ulysses_sp_size > 1:
attn_output = gather_heads_scatter_seq(attn_output, head_dim=2, seq_dim=1)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
attn_output = self.o_proj(attn_output)
return attn_output, None, None
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def get_weight_loader(arch: str):
from verl.models.llama.megatron.checkpoint_utils.llama_loader import load_state_dict_to_megatron_llama
from verl.models.qwen2.megatron.checkpoint_utils.qwen2_loader import load_state_dict_to_megatron_qwen2
from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel
_MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY = {
'LlamaForCausalLM': load_state_dict_to_megatron_gptmodel,
'Qwen2ForCausalLM': load_state_dict_to_megatron_gptmodel,
}
if arch in _MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY:
return _MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY[arch]
raise ValueError(f"Model architectures {arch} loader are not supported for now. "
f"Supported architectures: {_MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY.keys()}")
def get_weight_saver(arch: str):
from verl.models.llama.megatron.checkpoint_utils.llama_saver import merge_megatron_ckpt_llama
from verl.models.qwen2.megatron.checkpoint_utils.qwen2_saver import merge_megatron_ckpt_qwen2
from verl.models.mcore.saver import merge_megatron_ckpt_gptmodel
_MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY = {
'LlamaForCausalLM': merge_megatron_ckpt_gptmodel,
'Qwen2ForCausalLM': merge_megatron_ckpt_gptmodel,
}
if arch in _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY:
return _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY[arch]
raise ValueError(f"Model architectures {arch} saver are not supported for now. "
f"Supported architectures: {_MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY.keys()}")
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Implement the base data transfer protocol between any two functions or modules.
We can subclass Protocol to define more detailed batch info with specific keys
"""
import pickle
import numpy as np
import pandas as pd
import copy
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Union
import torch
import tensordict
from packaging import version
from tensordict import TensorDict
from torch.utils.data import DataLoader, Dataset
from verl.utils.py_functional import union_two_dict
__all__ = ['DataProto', 'union_tensor_dict']
try:
    tensordict.set_lazy_legacy(False).set()
except Exception:
    pass
def pad_dataproto_to_divisor(data: 'DataProto', size_divisor: int):
"""Pad a DataProto to size divisible by size_divisor
Args:
size_divisor (int): size divisor
Returns:
data: (DataProto): the padded DataProto
pad_size (int)
"""
assert isinstance(data, DataProto), 'data must be a DataProto'
if len(data) % size_divisor != 0:
pad_size = size_divisor - len(data) % size_divisor
padding_protos = []
remaining_pad = pad_size
while remaining_pad > 0:
take_size = min(remaining_pad, len(data))
padding_protos.append(data[:take_size])
remaining_pad -= take_size
data_padded = DataProto.concat([data] + padding_protos)
else:
pad_size = 0
data_padded = data
return data_padded, pad_size
def unpad_dataproto(data: 'DataProto', pad_size):
if pad_size != 0:
data = data[:-pad_size]
return data
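# Minimal sketch of the pad/unpad round trip (hypothetical sizes): a DataProto of
# length 5 padded to a divisor of 4 gains 3 repeated rows, which unpad removes again.
def _example_pad_unpad_roundtrip():
    data = DataProto.from_dict(tensors={'obs': torch.arange(10).view(5, 2)})
    padded, pad_size = pad_dataproto_to_divisor(data, size_divisor=4)
    assert pad_size == 3 and len(padded) == 8
    assert len(unpad_dataproto(padded, pad_size)) == 5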
def union_tensor_dict(tensor_dict1: TensorDict, tensor_dict2: TensorDict) -> TensorDict:
"""Union two tensordicts."""
assert tensor_dict1.batch_size == tensor_dict2.batch_size, \
        f'Two tensor dicts must have identical batch sizes. Got {tensor_dict1.batch_size} and {tensor_dict2.batch_size}'
for key in tensor_dict2.keys():
if key not in tensor_dict1.keys():
tensor_dict1[key] = tensor_dict2[key]
else:
assert tensor_dict1[key].equal(tensor_dict2[key]), \
                f'{key} in tensor_dict1 and tensor_dict2 are not equal'
return tensor_dict1
def union_numpy_dict(tensor_dict1: dict[str, np.ndarray], tensor_dict2: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
for key, val in tensor_dict2.items():
if key in tensor_dict1:
assert isinstance(tensor_dict2[key], np.ndarray)
assert isinstance(tensor_dict1[key], np.ndarray)
# to properly deal with nan and object type
assert pd.DataFrame(tensor_dict2[key]).equals(pd.DataFrame(tensor_dict1[key])), \
            f'{key} in tensor_dict1 and tensor_dict2 are not equal'
tensor_dict1[key] = val
return tensor_dict1
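# Sketch of the union semantics (hypothetical keys): overlapping keys must hold equal
# values, while keys only present in the second dict are merged in.
def _example_union_tensor_dict():
    td1 = TensorDict(source={'a': torch.zeros(2)}, batch_size=(2,))
    td2 = TensorDict(source={'a': torch.zeros(2), 'b': torch.ones(2)}, batch_size=(2,))
    merged = union_tensor_dict(td1, td2)
    assert set(merged.keys()) == {'a', 'b'}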
def list_of_dict_to_dict_of_list(list_of_dict: list[dict]):
if len(list_of_dict) == 0:
return {}
keys = list_of_dict[0].keys()
output = {key: [] for key in keys}
for data in list_of_dict:
for key, item in data.items():
assert key in output
output[key].append(item)
return output
def fold_batch_dim(data: 'DataProto', new_batch_size):
"""
Fold a batch dim from [bsz, xxx] into [new_bsz, bsz // new_bsz, xxx]
"""
batch_size = data.batch.batch_size[0]
assert batch_size % new_batch_size == 0
tensor: TensorDict = data.batch
non_tensor = data.non_tensor_batch
tensor = tensor.view(new_batch_size, -1)
tensor.auto_batch_size_(batch_dims=1)
for key, val in non_tensor.items():
non_tensor[key] = np.reshape(val, newshape=(new_batch_size, -1, *val.shape[1:]))
return DataProto(batch=tensor, non_tensor_batch=non_tensor, meta_info=data.meta_info)
def unfold_batch_dim(data: 'DataProto', batch_dims=2):
"""
Unfold the first n dims as new batch dim
"""
tensor: TensorDict = data.batch
non_tensor = data.non_tensor_batch
tensor.auto_batch_size_(batch_dims=batch_dims)
tensor = tensor.view(-1)
batch_size = tensor.batch_size[0]
non_tensor_new = {}
for key, val in non_tensor.items():
non_tensor_new[key] = np.reshape(val, newshape=(batch_size, *val.shape[batch_dims:]))
return DataProto(batch=tensor, non_tensor_batch=non_tensor_new, meta_info=data.meta_info)
def collate_fn(x: list['DataProtoItem']):
batch = []
non_tensor_batch = []
for data in x:
batch.append(data.batch)
non_tensor_batch.append(data.non_tensor_batch)
batch = torch.stack(batch).contiguous()
non_tensor_batch = list_of_dict_to_dict_of_list(non_tensor_batch)
for key, val in non_tensor_batch.items():
non_tensor_batch[key] = np.array(val, dtype=object)
return DataProto(batch=batch, non_tensor_batch=non_tensor_batch)
@dataclass
class DataProtoItem:
# TODO(zhangchi.usc1992) add consistency check
batch: TensorDict = None
non_tensor_batch: Dict = field(default_factory=dict)
meta_info: Dict = field(default_factory=dict)
@dataclass
class DataProto:
"""
A DataProto is a data structure that aims to provide a standard protocol for data exchange between functions.
It contains a batch (TensorDict) and a meta_info (Dict). The batch is a TensorDict https://pytorch.org/tensordict/.
TensorDict allows you to manipulate a dictionary of Tensors like a single Tensor. Ideally, the tensors with the
same batch size should be put inside batch.
"""
batch: TensorDict = None
non_tensor_batch: Dict = field(default_factory=dict)
meta_info: Dict = field(default_factory=dict)
def __post_init__(self):
# perform necessary checking
self.check_consistency()
def __len__(self):
if self.batch is not None:
return self.batch.batch_size[0]
elif self.non_tensor_batch is not None and len(self.non_tensor_batch) > 0:
random_key = list(self.non_tensor_batch.keys())[0]
return self.non_tensor_batch[random_key].shape[0]
else:
return 0
def __getitem__(self, item):
"""
Enhanced indexing for DataProto objects.
Args:
item: Can be one of:
- int: A single index
- slice: A slice object (start:stop:step)
- list: A list of indices
- numpy.ndarray: An array of indices
- torch.Tensor: A tensor of indices
Returns:
DataProto: For all indexing types except single integers
DataProtoItem: Only for single integer indices
"""
# Case 1: Slice object - use the slice method
if isinstance(item, slice):
return self.slice(item.start, item.stop, item.step)
        # Case 2: List, numpy array, or torch tensor - use select_idxs
elif isinstance(item, (list, np.ndarray, torch.Tensor)):
return self.select_idxs(item)
# Case 3: Single integer - return DataProtoItem for backward compatibility
elif isinstance(item, (int, np.integer)):
tensor_data = self.batch[item]
non_tensor_data = {key: val[item] for key, val in self.non_tensor_batch.items()}
return DataProtoItem(batch=tensor_data, non_tensor_batch=non_tensor_data, meta_info=self.meta_info)
# Case 4: Unsupported type
else:
raise TypeError(f"Indexing with {type(item)} is not supported")
def __getstate__(self):
import io
buffer = io.BytesIO()
if version.parse(tensordict.__version__) >= version.parse('0.5.0') and self.batch is not None:
self.batch = self.batch.contiguous()
self.batch = self.batch.consolidate()
torch.save(self.batch, buffer)
buffer_bytes = buffer.getvalue()
return buffer_bytes, self.non_tensor_batch, self.meta_info
def __setstate__(self, data):
import io
batch_deserialized_bytes, non_tensor_batch, meta_info = data
batch_deserialized = io.BytesIO(initial_bytes=batch_deserialized_bytes)
batch = torch.load(batch_deserialized,
weights_only=False,
map_location='cpu' if not torch.cuda.is_available() else None)
self.batch = batch
self.non_tensor_batch = non_tensor_batch
self.meta_info = meta_info
def save_to_disk(self, filepath):
with open(filepath, 'wb') as f:
pickle.dump(self, f)
@staticmethod
def load_from_disk(filepath) -> 'DataProto':
with open(filepath, 'rb') as f:
data = pickle.load(f)
return data
def print_size(self, prefix=""):
size_of_tensordict = 0
for key, tensor in self.batch.items():
size_of_tensordict += tensor.element_size() * tensor.numel()
size_of_numpy_array = 0
for key, numpy_array in self.non_tensor_batch.items():
size_of_numpy_array += numpy_array.nbytes
size_of_numpy_array /= 1024**3
size_of_tensordict /= 1024**3
message = f'Size of tensordict: {size_of_tensordict} GB, size of non_tensor_batch: {size_of_numpy_array} GB'
if prefix:
message = f'{prefix}, ' + message
print(message)
def check_consistency(self):
"""Check the consistency of the DataProto. Mainly for batch and non_tensor_batch
We expose this function as a public one so that user can call themselves directly
"""
if self.batch is not None:
assert len(self.batch.batch_size) == 1, 'only support num_batch_dims=1'
if self.non_tensor_batch is not None:
for key, val in self.non_tensor_batch.items():
assert isinstance(val, np.ndarray)
if self.batch is not None and len(self.non_tensor_batch) != 0:
# TODO: we can actually lift this restriction if needed
assert len(self.batch.batch_size) == 1, 'only support num_batch_dims=1 when non_tensor_batch is not empty.'
batch_size = self.batch.batch_size[0]
for key, val in self.non_tensor_batch.items():
assert isinstance(
val, np.ndarray
), f'data in the non_tensor_batch must be a numpy.array with dtype=object, but for {key=}, got {type(val)=}'
assert val.shape[
0] == batch_size, f'key {key} length {len(val)} is not equal to batch size {batch_size}'
@classmethod
def from_single_dict(cls, data: Dict[str, Union[torch.Tensor, np.ndarray]], meta_info=None):
tensors = {}
non_tensors = {}
for key, val in data.items():
if isinstance(val, torch.Tensor):
tensors[key] = val
elif isinstance(val, np.ndarray):
non_tensors[key] = val
else:
raise ValueError(f'Unsupported type in data {type(val)}')
return DataProto.from_dict(tensors=tensors, non_tensors=non_tensors, meta_info=meta_info)
@classmethod
def from_dict(cls, tensors: Dict[str, torch.Tensor], non_tensors=None, meta_info=None, num_batch_dims=1):
"""Create a DataProto from a dict of tensors. This assumes that
        1. All the tensors in `tensors` share the same dim0
2. Only dim0 is the batch dim
"""
assert len(tensors) > 0, 'tensors must not be empty'
assert num_batch_dims > 0, 'num_batch_dims must be greater than zero'
if non_tensors is not None:
assert num_batch_dims == 1, 'only support num_batch_dims=1 when non_tensors is not None.'
if meta_info is None:
meta_info = {}
if non_tensors is None:
non_tensors = {}
assert isinstance(non_tensors, dict)
# get and check batch size
batch_size = None
pivot_key = None
for key, tensor in tensors.items():
if batch_size is None:
batch_size = tensor.shape[:num_batch_dims]
pivot_key = key
else:
current_batch = tensor.shape[:num_batch_dims]
assert batch_size == current_batch, \
f'Not all the tensor in tensors have the same batch size with batch_dims={num_batch_dims}. Got {pivot_key} has {batch_size}, {key} has {current_batch}'
for key, val in non_tensors.items():
non_tensors[key] = np.array(val, dtype=object)
tensor_dict = TensorDict(source=tensors, batch_size=batch_size)
return cls(batch=tensor_dict, non_tensor_batch=non_tensors, meta_info=meta_info)
def to(self, device) -> 'DataProto':
"""move the batch to device
Args:
device (torch.device, str): torch device
Returns:
DataProto: the current DataProto
"""
if self.batch is not None:
self.batch = self.batch.to(device)
return self
def select(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_keys=None, deepcopy=False) -> 'DataProto':
"""Select a subset of the DataProto via batch_keys and meta_info_keys
Args:
batch_keys (list, optional): a list of strings indicating the keys in batch to select
meta_info_keys (list, optional): a list of keys indicating the meta info to select
Returns:
DataProto: the DataProto with the selected batch_keys and meta_info_keys
"""
# TODO (zhangchi.usc1992) whether to copy
if batch_keys is not None:
batch_keys = tuple(batch_keys)
sub_batch = self.batch.select(*batch_keys)
else:
sub_batch = self.batch
if non_tensor_batch_keys is not None:
non_tensor_batch = {key: val for key, val in self.non_tensor_batch.items() if key in non_tensor_batch_keys}
else:
non_tensor_batch = self.non_tensor_batch
if deepcopy:
non_tensor_batch = copy.deepcopy(non_tensor_batch)
if meta_info_keys is not None:
sub_meta_info = {key: val for key, val in self.meta_info.items() if key in meta_info_keys}
else:
sub_meta_info = self.meta_info
if deepcopy:
sub_meta_info = copy.deepcopy(sub_meta_info)
return DataProto(batch=sub_batch, non_tensor_batch=non_tensor_batch, meta_info=sub_meta_info)
def select_idxs(self, idxs):
"""
Select specific indices from the DataProto.
Args:
idxs (torch.Tensor or numpy.ndarray or list): Indices to select
Returns:
DataProto: A new DataProto containing only the selected indices
"""
if isinstance(idxs, list):
idxs = torch.tensor(idxs, dtype=torch.int32)
if isinstance(idxs, np.ndarray):
idxs_np = idxs
idxs_torch = torch.from_numpy(idxs)
else: # torch.Tensor
idxs_torch = idxs
idxs_np = idxs.detach().cpu().numpy()
if self.batch is not None:
# Use TensorDict's built-in indexing capabilities
selected_batch = TensorDict(source={
key: tensor[idxs_torch] for key, tensor in self.batch.items()
},
batch_size=(idxs_torch.shape[0],))
else:
selected_batch = None
selected_non_tensor = {}
for key, val in self.non_tensor_batch.items():
selected_non_tensor[key] = val[idxs_np]
return DataProto(batch=selected_batch, non_tensor_batch=selected_non_tensor, meta_info=self.meta_info)
def slice(self, start=None, end=None, step=None):
"""
Slice the DataProto and return a new DataProto object.
This is an improved version of direct slicing which returns a DataProtoItem.
Args:
start (int, optional): Start index. Defaults to None (start from beginning).
end (int, optional): End index (exclusive). Defaults to None (go to end).
step (int, optional): Step size. Defaults to None (step=1).
Returns:
DataProto: A new DataProto containing the sliced data
Examples:
# Using the slice method directly
sliced_data = data_proto.slice(10, 20)
# Using enhanced indexing (returns DataProto)
sliced_data = data_proto[10:20]
sliced_data = data_proto[::2] # Every other element
# Using list indexing (returns DataProto)
indices = [1, 5, 10]
selected_data = data_proto[indices]
# Single index still returns DataProtoItem
single_item = data_proto[5]
"""
# Create a slice object
slice_obj = slice(start, end, step)
# Handle the batch data
if self.batch is not None:
# Use TensorDict's built-in slicing capabilities
sliced_batch = self.batch[slice_obj]
else:
sliced_batch = None
# Handle the non-tensor batch data
sliced_non_tensor = {}
for key, val in self.non_tensor_batch.items():
sliced_non_tensor[key] = val[slice_obj]
# Return a new DataProto object
return DataProto(batch=sliced_batch, non_tensor_batch=sliced_non_tensor, meta_info=self.meta_info)
def pop(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_keys=None) -> 'DataProto':
"""Pop a subset of the DataProto via `batch_keys` and `meta_info_keys`
Args:
batch_keys (list, optional): a list of strings indicating the keys in batch to pop
meta_info_keys (list, optional): a list of keys indicating the meta info to pop
Returns:
            DataProto: the DataProto with the popped batch_keys and meta_info_keys
"""
assert batch_keys is not None
if meta_info_keys is None:
meta_info_keys = []
if non_tensor_batch_keys is None:
non_tensor_batch_keys = []
tensors = {}
# tensor batch
for key in batch_keys:
assert key in self.batch.keys()
tensors[key] = self.batch.pop(key)
non_tensors = {}
# non tensor batch
for key in non_tensor_batch_keys:
assert key in self.non_tensor_batch.keys()
non_tensors[key] = self.non_tensor_batch.pop(key)
meta_info = {}
for key in meta_info_keys:
assert key in self.meta_info.keys()
meta_info[key] = self.meta_info.pop(key)
return DataProto.from_dict(tensors=tensors, non_tensors=non_tensors, meta_info=meta_info)
def rename(self, old_keys=None, new_keys=None) -> 'DataProto':
"""
        Note that this function only renames keys in the batch
"""
def validate_input(keys):
if keys is not None:
if isinstance(keys, str):
keys = [keys]
elif isinstance(keys, list):
pass
else:
raise TypeError(f'keys must be a list or a string, but got {type(keys)}')
return keys
old_keys = validate_input(old_keys)
new_keys = validate_input(new_keys)
if len(new_keys) != len(old_keys):
raise ValueError(
f'new_keys and old_keys must have the same length, but got {len(new_keys)} and {len(old_keys)}')
self.batch.rename_key_(tuple(old_keys), tuple(new_keys))
return self
def union(self, other: 'DataProto') -> 'DataProto':
"""Union with another DataProto. Union batch and meta_info separately.
Throw an error if
- there are conflict keys in batch and they are not equal
        - the batch sizes of the two data batches are not the same
- there are conflict keys in meta_info and they are not the same.
Args:
other (DataProto): another DataProto to union
Returns:
DataProto: the DataProto after union
"""
self.batch = union_tensor_dict(self.batch, other.batch)
self.non_tensor_batch = union_numpy_dict(self.non_tensor_batch, other.non_tensor_batch)
self.meta_info = union_two_dict(self.meta_info, other.meta_info)
return self
def make_iterator(self, mini_batch_size, epochs, seed=None, dataloader_kwargs=None):
r"""Make an iterator from the DataProto. This is built upon that TensorDict can be used as a normal Pytorch
dataset. See https://pytorch.org/tensordict/tutorials/data_fashion for more details.
Args:
mini_batch_size (int): mini-batch size when iterating the dataset. We require that ``batch.batch_size[0] % mini_batch_size == 0``.
epochs (int): number of epochs when iterating the dataset.
dataloader_kwargs (Any): internally, it returns a DataLoader over the batch. The dataloader_kwargs is the kwargs passed to the DataLoader.
Returns:
            Iterator: an iterator that yields one mini-batch at a time. The total number of iteration steps is ``self.batch.batch_size[0] * epochs // mini_batch_size``
"""
assert self.batch.batch_size[0] % mini_batch_size == 0, f"{self.batch.batch_size[0]} % {mini_batch_size} != 0"
# we can directly create a dataloader from TensorDict
if dataloader_kwargs is None:
dataloader_kwargs = {}
if seed is not None:
generator = torch.Generator()
generator.manual_seed(seed)
else:
generator = None
assert isinstance(dataloader_kwargs, Dict)
train_dataloader = DataLoader(dataset=self,
batch_size=mini_batch_size,
collate_fn=collate_fn,
generator=generator,
**dataloader_kwargs)
def get_data():
for _ in range(epochs):
for d in train_dataloader:
d.meta_info = self.meta_info
yield d
return iter(get_data())
def chunk(self, chunks: int) -> List['DataProto']:
"""Split the batch among dim=0 into chunks. The meta_info is passed to each DataProto after split.
Args:
chunks (int): the number of chunks to split on dim=0
Returns:
List[DataProto]: a list of DataProto after splitting
"""
assert len(
self) % chunks == 0, f'only support equal chunk. Got size of DataProto {len(self)} and chunk {chunks}.'
if self.batch is not None:
batch_lst = self.batch.chunk(chunks=chunks, dim=0)
else:
batch_lst = [None for _ in range(chunks)]
non_tensor_batch_lst = [{} for _ in range(chunks)]
for key, val in self.non_tensor_batch.items():
assert isinstance(val, np.ndarray)
non_tensor_lst = np.array_split(val, chunks)
assert len(non_tensor_lst) == chunks
for i in range(chunks):
non_tensor_batch_lst[i][key] = non_tensor_lst[i]
output = []
for i in range(chunks):
output.append(
DataProto(batch=batch_lst[i], non_tensor_batch=non_tensor_batch_lst[i], meta_info=self.meta_info))
return output
@staticmethod
def concat(data: List['DataProto']) -> 'DataProto':
"""Concat a list of DataProto. The batch is concatenated among dim=0.
The meta_info is assumed to be identical and will use the first one.
Args:
data (List[DataProto]): list of DataProto
Returns:
DataProto: concatenated DataProto
"""
batch_lst = []
for batch in data:
batch_lst.append(batch.batch)
if batch_lst[0] is not None:
new_batch = torch.cat(batch_lst, dim=0)
else:
new_batch = None
non_tensor_batch = list_of_dict_to_dict_of_list(list_of_dict=[d.non_tensor_batch for d in data])
for key, val in non_tensor_batch.items():
non_tensor_batch[key] = np.concatenate(val, axis=0)
return DataProto(batch=new_batch, non_tensor_batch=non_tensor_batch, meta_info=data[0].meta_info)
def reorder(self, indices):
"""
Note that this operation is in-place
"""
indices_np = indices.detach().numpy()
self.batch = self.batch[indices]
self.non_tensor_batch = {key: val[indices_np] for key, val in self.non_tensor_batch.items()}
def repeat(self, repeat_times=2, interleave=True):
"""
Repeat the batch data a specified number of times.
Args:
repeat_times (int): Number of times to repeat the data.
interleave (bool): Whether to interleave the repeated data.
Returns:
DataProto: A new DataProto with repeated data.
"""
if self.batch is not None:
if interleave:
# Interleave the data
repeated_tensors = {
key: tensor.repeat_interleave(repeat_times, dim=0) for key, tensor in self.batch.items()
}
else:
# Stack the data
repeated_tensors = {
key: tensor.unsqueeze(0).expand(repeat_times, *tensor.shape).reshape(-1, *tensor.shape[1:])
for key, tensor in self.batch.items()
}
repeated_batch = TensorDict(
source=repeated_tensors,
batch_size=(self.batch.batch_size[0] * repeat_times,),
)
else:
repeated_batch = None
repeated_non_tensor_batch = {}
for key, val in self.non_tensor_batch.items():
if interleave:
repeated_non_tensor_batch[key] = np.repeat(val, repeat_times, axis=0)
else:
repeated_non_tensor_batch[key] = np.tile(val, (repeat_times,) + (1,) * (val.ndim - 1))
return DataProto(
batch=repeated_batch,
non_tensor_batch=repeated_non_tensor_batch,
meta_info=self.meta_info,
)
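# Hedged end-to-end sketch of the DataProto API above (hypothetical keys and values):
# from_dict infers the batch size from dim0, chunk/concat are inverses for equal chunks,
# and repeat(interleave=True/False) differs only in ordering ([a,a,b,b] vs [a,b,a,b]).
def _example_dataproto_basics():
    data = DataProto.from_dict(
        tensors={'x': torch.tensor([[1], [2]])},
        non_tensors={'tag': np.array(['a', 'b'], dtype=object)},
    )
    assert len(data) == 2
    restored = DataProto.concat(data.chunk(2))
    assert torch.equal(restored.batch['x'], data.batch['x'])
    assert data.repeat(2, interleave=True).batch['x'].flatten().tolist() == [1, 1, 2, 2]
    assert data.repeat(2, interleave=False).batch['x'].flatten().tolist() == [1, 2, 1, 2]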
import ray
@dataclass
class DataProtoFuture:
"""
    DataProtoFuture aims to eliminate actual data fetching on the driver. By doing so, the driver doesn't have to wait
    for data, so that asynchronous execution becomes possible.
    DataProtoFuture contains a list of futures from another WorkerGroup of size world_size.
    - collect_fn is a Callable that reduces the list of futures to a DataProto
    - dispatch_fn is a Callable that partitions the DataProto into a list of DataProto of size world_size and then selects one shard
    Potential issue: we can optimize dispatch_fn(collect_fn) such that only the needed data is fetched at the destination
    - DataProtoFuture only supports passing directly from the output of one method to the input of another. You can't perform any
    operation on a DataProtoFuture in the driver.
"""
collect_fn: Callable
futures: List[ray.ObjectRef]
dispatch_fn: Callable = None
@staticmethod
def concat(data: List[ray.ObjectRef]) -> 'DataProtoFuture':
output = DataProtoFuture(collect_fn=DataProto.concat, futures=data)
return output
def chunk(self, chunks: int) -> List['DataProtoFuture']:
from functools import partial
arg_future_lst = []
for i in range(chunks):
# note that we can't directly pass i and chunks
def dispatch_fn(x, i, chunks):
return x.chunk(chunks=chunks)[i]
arg_future = DataProtoFuture(collect_fn=self.collect_fn,
dispatch_fn=partial(dispatch_fn, i=i, chunks=chunks),
futures=self.futures)
arg_future_lst.append(arg_future)
return arg_future_lst
def get(self):
        output = ray.get(self.futures)  # a list of DataProto of length dp_size
for o in output:
assert isinstance(o, DataProto)
output = self.collect_fn(output) # select dp, concat
if self.dispatch_fn is not None:
output = self.dispatch_fn(output) # split in batch dim, select using dp
return output
from verl.utils.torch_functional import allgather_dict_tensors
import torch.distributed
def all_gather_data_proto(data: DataProto, process_group):
    # Note that this is an in-place operation, just like torch.distributed.all_gather
group_size = torch.distributed.get_world_size(group=process_group)
assert isinstance(data, DataProto)
prev_device = data.batch.device
data.batch = data.batch.cuda(device=torch.cuda.current_device())
data.batch = allgather_dict_tensors(data.batch.contiguous(), size=group_size, group=process_group, dim=0)
data.batch = data.batch.to(prev_device)
# all gather non_tensor_batch
all_non_tensor_batch = [None for _ in range(group_size)]
torch.distributed.all_gather_object(all_non_tensor_batch, data.non_tensor_batch, group=process_group)
data.non_tensor_batch = {k: np.concatenate([d[k] for d in all_non_tensor_batch]) for k in data.non_tensor_batch}
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
version_folder = os.path.dirname(os.path.abspath(__file__))
# Note(haibin.lin): single_controller.__version__ is deprecated
with open(os.path.join(version_folder, os.pardir, 'version/version')) as f:
__version__ = f.read().strip()
from . import base
from .base import *
__all__ = base.__all__
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .worker import Worker
from .worker_group import WorkerGroup, ClassWithInitArgs, ResourcePool
__all__ = ['Worker', 'WorkerGroup', 'ClassWithInitArgs', 'ResourcePool']
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
from functools import wraps
from typing import Dict, List, Tuple
from types import FunctionType
from verl.protocol import DataProtoFuture
# here we add a magic number to avoid clashing with an attribute a user-defined function may already have
MAGIC_ATTR = 'attrs_3141562937'
class Dispatch(Enum):
RANK_ZERO = 0
ONE_TO_ALL = 1
ALL_TO_ALL = 2
MEGATRON_COMPUTE = 3
MEGATRON_PP_AS_DP = 4
MEGATRON_PP_ONLY = 5
MEGATRON_COMPUTE_PROTO = 6
MEGATRON_PP_AS_DP_PROTO = 7
DP_COMPUTE = 8
DP_COMPUTE_PROTO = 9
DP_COMPUTE_PROTO_WITH_FUNC = 10
DP_COMPUTE_METRIC = 11
class Execute(Enum):
ALL = 0
RANK_ZERO = 1
def _split_args_kwargs_data_proto(chunks, *args, **kwargs):
from verl.protocol import DataProto, DataProtoFuture
splitted_args = []
for arg in args:
assert isinstance(arg, (DataProto, DataProtoFuture))
splitted_args.append(arg.chunk(chunks=chunks))
splitted_kwargs = {}
for key, val in kwargs.items():
assert isinstance(val, (DataProto, DataProtoFuture))
splitted_kwargs[key] = val.chunk(chunks=chunks)
return splitted_args, splitted_kwargs
def dispatch_one_to_all(worker_group, *args, **kwargs):
args = tuple([arg] * worker_group.world_size for arg in args)
kwargs = {k: [v] * worker_group.world_size for k, v in kwargs.items()}
return args, kwargs
def dispatch_all_to_all(worker_group, *args, **kwargs):
return args, kwargs
def collect_all_to_all(worker_group, output):
return output
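# Minimal sketch with a hypothetical stub (not a real WorkerGroup): ONE_TO_ALL simply
# replicates every argument world_size times, so each worker receives the same input.
def _example_dispatch_one_to_all():

    class _StubWorkerGroup:
        world_size = 4

    args, kwargs = dispatch_one_to_all(_StubWorkerGroup(), 1, flag='a')
    assert args == ([1, 1, 1, 1],)
    assert kwargs == {'flag': ['a', 'a', 'a', 'a']}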
def dispatch_megatron_compute(worker_group, *args, **kwargs):
"""
    The user passes in dp data. The data is dispatched to all tp/pp/cp ranks with the same dp rank
"""
from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group,
MegatronWorkerGroup), f'worker_group must be MegatronWorkerGroup, Got {type(worker_group)}'
all_args = []
for arg in args:
assert isinstance(arg, (Tuple, List)) and len(arg) == worker_group.dp_size
transformed_args = []
for i in range(worker_group.world_size):
local_dp_rank = worker_group.get_megatron_rank_info(rank=i).dp_rank
transformed_args.append(arg[local_dp_rank])
all_args.append(transformed_args)
all_args = tuple(all_args)
all_kwargs = {}
for k, v in kwargs.items():
assert isinstance(v, (Tuple, List)) and len(v) == worker_group.dp_size
transformed_v = []
for i in range(worker_group.world_size):
local_dp_rank = worker_group.get_megatron_rank_info(rank=i).dp_rank
transformed_v.append(v[local_dp_rank])
all_kwargs[k] = transformed_v
return all_args, all_kwargs
def collect_megatron_compute(worker_group, output):
"""
    Only collect the data from tp_rank=0, cp_rank=0 and the last pp rank, across all dp ranks
"""
from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
output_in_dp = []
pp_size = worker_group.get_megatron_global_info().pp_size
for global_rank in range(worker_group.world_size):
local_rank_info = worker_group.get_megatron_rank_info(rank=global_rank)
if local_rank_info.tp_rank == 0 and local_rank_info.pp_rank == pp_size - 1 and local_rank_info.cp_rank == 0:
output_in_dp.append(output[global_rank])
return output_in_dp
def dispatch_megatron_compute_data_proto(worker_group, *args, **kwargs):
"""
All the args and kwargs must be DataProto. The batch will be chunked by dp_size and passed to each rank
"""
from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
splitted_args, splitted_kwargs = _split_args_kwargs_data_proto(worker_group.dp_size, *args, **kwargs)
return dispatch_megatron_compute(worker_group, *splitted_args, **splitted_kwargs)
def _concat_data_proto_or_future(output: List):
from verl.protocol import DataProto, DataProtoFuture
import ray
    # make sure all the elements in output have the same type
for o in output:
assert type(o) == type(output[0])
o = output[0]
if isinstance(o, DataProto):
return DataProto.concat(output)
elif isinstance(o, ray.ObjectRef):
return DataProtoFuture.concat(output)
else:
raise NotImplementedError
def collect_megatron_compute_data_proto(worker_group, output):
"""
    Each output must be a DataProto. We concat them along dim=0
"""
from verl.protocol import DataProto
import ray
output = collect_megatron_compute(worker_group, output)
for o in output:
assert isinstance(o, (DataProto, ray.ObjectRef)), f"expecting {o} to be DataProto, but got {type(o)}"
return _concat_data_proto_or_future(output)
def dispatch_megatron_pp_as_dp(worker_group, *args, **kwargs):
"""
    Treat pp as dp.
"""
from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
pp_size = worker_group.pp_size
dp_size = worker_group.dp_size
cp_size = worker_group.cp_size
pp_dp_cp_size = pp_size * dp_size * cp_size
all_args = []
for arg in args:
assert isinstance(arg, (List, Tuple)) and len(arg) == pp_dp_cp_size
transformed_args = []
for i in range(worker_group.world_size):
local_dp_rank = worker_group.get_megatron_rank_info(rank=i).dp_rank
local_pp_rank = worker_group.get_megatron_rank_info(rank=i).pp_rank
local_cp_rank = worker_group.get_megatron_rank_info(rank=i).cp_rank
# compute the rank in arg. Note that the order is dp then cp then pp
            # Also note that the outputs within a pp group will first be all-gathered; then only the output of pp0 will be collected.
# For pp=2 dp=4, a batch of data "ABCDEFGH" should be dispatched and collected in below order:
            # dispatch: pp_allgather: collect:
# dp 0 1 2 3 dp 0 1 2 3
# pp +---------+ pp +-------------+
# 0 | A C E G | 0 | AB CD EF GH | ABCDEFGH
# 1 | B D F H | 1 | AB CD EF GH |
# +---------+ +-------------+
dp_cp_rank = local_cp_rank * dp_size + local_dp_rank
arg_rank = dp_cp_rank * pp_size + local_pp_rank
transformed_args.append(arg[arg_rank])
all_args.append(transformed_args)
all_args = tuple(all_args)
all_kwargs = {}
for k, v in kwargs.items():
assert isinstance(v, (List, Tuple)) and len(v) == pp_dp_cp_size, f'expect len(v)=={pp_dp_cp_size}, got {len(v)}'
transformed_v = []
for i in range(worker_group.world_size):
local_dp_rank = worker_group.get_megatron_rank_info(rank=i).dp_rank
local_pp_rank = worker_group.get_megatron_rank_info(rank=i).pp_rank
local_cp_rank = worker_group.get_megatron_rank_info(rank=i).cp_rank
# compute the rank in arg. Note that the order is dp then cp then pp
dp_cp_rank = local_cp_rank * dp_size + local_dp_rank
arg_rank = dp_cp_rank * pp_size + local_pp_rank
transformed_v.append(v[arg_rank])
all_kwargs[k] = transformed_v
return all_args, all_kwargs
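# Pure-arithmetic sketch of the rank mapping above for pp=2, dp=4, cp=1 (the setting in
# the dispatch table): shard `dp_cp_rank * pp_size + local_pp_rank` goes to each worker,
# reproducing rows A C E G (pp=0) and B D F H (pp=1).
def _example_pp_as_dp_rank_mapping():
    pp_size, dp_size = 2, 4  # cp_size == 1, so cp_rank == 0 below
    shards = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']  # pp_dp_cp_size == 8 chunks
    table = {}
    for pp_rank in range(pp_size):
        for dp_rank in range(dp_size):
            dp_cp_rank = 0 * dp_size + dp_rank
            table[(pp_rank, dp_rank)] = shards[dp_cp_rank * pp_size + pp_rank]
    assert [table[(0, d)] for d in range(dp_size)] == ['A', 'C', 'E', 'G']
    assert [table[(1, d)] for d in range(dp_size)] == ['B', 'D', 'F', 'H']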
def collect_megatron_pp_as_dp(worker_group, output):
"""
    Treat pp as dp. Only collect data on tp_rank=0
"""
from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
output_in_dp = []
for global_rank in range(worker_group.world_size):
local_rank_info = worker_group.get_megatron_rank_info(rank=global_rank)
if local_rank_info.tp_rank == 0:
output_in_dp.append(output[global_rank])
return output_in_dp
def collect_megatron_pp_only(worker_group, output):
"""
    Only collect the output of megatron pp ranks. This is useful when examining weight names, as they are identical across tp/dp ranks
"""
from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
output_in_pp = []
for global_rank in range(worker_group.world_size):
local_rank_info = worker_group.get_megatron_rank_info(rank=global_rank)
if local_rank_info.tp_rank == 0 and local_rank_info.dp_rank == 0:
output_in_pp.append(output[global_rank])
return output_in_pp
def dispatch_megatron_pp_as_dp_data_proto(worker_group, *args, **kwargs):
from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
pp_dp_cp_size = worker_group.dp_size * worker_group.pp_size * worker_group.cp_size
splitted_args, splitted_kwargs = _split_args_kwargs_data_proto(pp_dp_cp_size, *args, **kwargs)
ret = dispatch_megatron_pp_as_dp(worker_group, *splitted_args, **splitted_kwargs)
return ret
def collect_megatron_pp_as_dp_data_proto(worker_group, output):
from verl.protocol import DataProto
from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
assert isinstance(worker_group, MegatronWorkerGroup)
output = collect_megatron_pp_as_dp(worker_group, output)
return _concat_data_proto_or_future(output)
def dispatch_dp_compute(worker_group, *args, **kwargs):
from verl.single_controller.base.worker_group import WorkerGroup
assert isinstance(worker_group, WorkerGroup)
for arg in args:
assert isinstance(arg, (Tuple, List)) and len(arg) == worker_group.world_size
for k, v in kwargs.items():
assert isinstance(v, (Tuple, List)) and len(v) == worker_group.world_size
return args, kwargs
def collect_dp_compute(worker_group, output):
from verl.single_controller.base.worker_group import WorkerGroup
assert isinstance(worker_group, WorkerGroup)
assert len(output) == worker_group.world_size
return output
def dispatch_dp_compute_data_proto(worker_group, *args, **kwargs):
from verl.single_controller.base.worker_group import WorkerGroup
assert isinstance(worker_group, WorkerGroup)
splitted_args, splitted_kwargs = _split_args_kwargs_data_proto(worker_group.world_size, *args, **kwargs)
return splitted_args, splitted_kwargs
def dispatch_dp_compute_data_proto_with_func(worker_group, *args, **kwargs):
from verl.single_controller.base.worker_group import WorkerGroup
assert isinstance(worker_group, WorkerGroup)
    assert type(args[0]) == FunctionType  # NOTE: the first positional arg must be a function!
splitted_args, splitted_kwargs = _split_args_kwargs_data_proto(worker_group.world_size, *args[1:], **kwargs)
splitted_args_with_func = [[args[0]] * worker_group.world_size] + splitted_args
return splitted_args_with_func, splitted_kwargs
def collect_dp_compute_data_proto(worker_group, output):
from verl.protocol import DataProto
import ray
for o in output:
assert isinstance(o, (DataProto, ray.ObjectRef)), f"expecting {o} to be DataProto, but got {type(o)}"
output = collect_dp_compute(worker_group, output)
return _concat_data_proto_or_future(output)
def get_predefined_dispatch_fn(dispatch_mode):
predefined_dispatch_mode_fn = {
Dispatch.ONE_TO_ALL: {
'dispatch_fn': dispatch_one_to_all,
'collect_fn': collect_all_to_all,
},
Dispatch.ALL_TO_ALL: {
'dispatch_fn': dispatch_all_to_all,
'collect_fn': collect_all_to_all,
},
Dispatch.MEGATRON_COMPUTE: {
'dispatch_fn': dispatch_megatron_compute,
'collect_fn': collect_megatron_compute,
},
Dispatch.MEGATRON_PP_AS_DP: {
'dispatch_fn': dispatch_megatron_pp_as_dp,
'collect_fn': collect_megatron_pp_as_dp,
},
Dispatch.MEGATRON_PP_ONLY: {
'dispatch_fn': dispatch_one_to_all,
'collect_fn': collect_megatron_pp_only
},
Dispatch.MEGATRON_COMPUTE_PROTO: {
'dispatch_fn': dispatch_megatron_compute_data_proto,
'collect_fn': collect_megatron_compute_data_proto
},
Dispatch.MEGATRON_PP_AS_DP_PROTO: {
'dispatch_fn': dispatch_megatron_pp_as_dp_data_proto,
'collect_fn': collect_megatron_pp_as_dp_data_proto
},
Dispatch.DP_COMPUTE: {
'dispatch_fn': dispatch_dp_compute,
'collect_fn': collect_dp_compute
},
Dispatch.DP_COMPUTE_PROTO: {
'dispatch_fn': dispatch_dp_compute_data_proto,
'collect_fn': collect_dp_compute_data_proto
},
Dispatch.DP_COMPUTE_PROTO_WITH_FUNC: {
'dispatch_fn': dispatch_dp_compute_data_proto_with_func,
'collect_fn': collect_dp_compute_data_proto
},
Dispatch.DP_COMPUTE_METRIC: {
'dispatch_fn': dispatch_dp_compute_data_proto,
'collect_fn': collect_dp_compute
}
}
return predefined_dispatch_mode_fn[dispatch_mode]
def get_predefined_execute_fn(execute_mode):
"""
    Note that here we only ask `execute_all` and `execute_rank_zero` to be implemented.
    The choice of how these two functions handle the argument 'blocking' is left to users.
"""
predefined_execute_mode_fn = {
Execute.ALL: {
'execute_fn_name': 'execute_all'
},
Execute.RANK_ZERO: {
'execute_fn_name': 'execute_rank_zero'
}
}
return predefined_execute_mode_fn[execute_mode]
def _check_dispatch_mode(dispatch_mode):
assert isinstance(dispatch_mode,
(Dispatch, Dict)), f'dispatch_mode must be a Dispatch or a Dict. Got {dispatch_mode}'
if isinstance(dispatch_mode, Dict):
necessary_keys = ['dispatch_fn', 'collect_fn']
for key in necessary_keys:
assert key in dispatch_mode, f'key {key} should be in dispatch_mode if it is a dictionary'
def _check_execute_mode(execute_mode):
    assert isinstance(execute_mode, Execute), f'execute_mode must be an Execute. Got {execute_mode}'
def _materialize_futures(*args, **kwargs):
new_args = []
for arg in args:
if isinstance(arg, DataProtoFuture):
arg = arg.get()
            # extend here to materialize more types if needed
new_args.append(arg)
for k, v in kwargs.items():
if isinstance(v, DataProtoFuture):
kwargs[k] = v.get()
new_args = tuple(new_args)
return new_args, kwargs
def register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.ALL, blocking=True, materialize_futures=True):
_check_dispatch_mode(dispatch_mode=dispatch_mode)
_check_execute_mode(execute_mode=execute_mode)
def decorator(func):
@wraps(func)
def inner(*args, **kwargs):
if materialize_futures:
args, kwargs = _materialize_futures(*args, **kwargs)
return func(*args, **kwargs)
attrs = {'dispatch_mode': dispatch_mode, 'execute_mode': execute_mode, 'blocking': blocking}
setattr(inner, MAGIC_ATTR, attrs)
return inner
return decorator
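# Usage sketch: @register only wraps the function to materialize DataProtoFuture args
# and stores the dispatch/execute metadata under MAGIC_ATTR; the WorkerGroup machinery
# later reads that attribute to pick the matching dispatch_fn/collect_fn.
def _example_register():

    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
    def ping(x):
        return x

    attrs = getattr(ping, MAGIC_ATTR)
    assert attrs['dispatch_mode'] == Dispatch.ONE_TO_ALL
    assert attrs['execute_mode'] == Execute.ALL and attrs['blocking'] is True
    assert ping(1) == 1  # a plain call still works; futures would be materialized first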
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from verl.single_controller.base.worker import Worker, DistRankInfo, DistGlobalInfo
class MegatronWorker(Worker):
def __init__(self, cuda_visible_devices=None) -> None:
super().__init__(cuda_visible_devices)
def get_megatron_global_info(self):
from megatron.core import parallel_state as mpu
tp_size = mpu.get_tensor_model_parallel_world_size()
dp_size = mpu.get_data_parallel_world_size()
pp_size = mpu.get_pipeline_model_parallel_world_size()
cp_size = mpu.get_context_parallel_world_size()
info = DistGlobalInfo(tp_size=tp_size, dp_size=dp_size, pp_size=pp_size, cp_size=cp_size)
return info
def get_megatron_rank_info(self):
from megatron.core import parallel_state as mpu
tp_rank = mpu.get_tensor_model_parallel_rank()
dp_rank = mpu.get_data_parallel_rank()
pp_rank = mpu.get_pipeline_model_parallel_rank()
cp_rank = mpu.get_context_parallel_rank()
info = DistRankInfo(tp_rank=tp_rank, dp_rank=dp_rank, pp_rank=pp_rank, cp_rank=cp_rank)
return info
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict
from .worker import DistRankInfo, DistGlobalInfo
from verl.single_controller.base import ResourcePool, WorkerGroup
class MegatronWorkerGroup(WorkerGroup):
def __init__(self, resource_pool: ResourcePool, **kwargs):
super().__init__(resource_pool=resource_pool, **kwargs)
self._megatron_rank_info = None
self._megatron_global_info: DistGlobalInfo = None
def init_megatron(self, default_megatron_kwargs: Dict = None):
        raise NotImplementedError("MegatronWorkerGroup.init_megatron should be overwritten")
def get_megatron_rank_info(self, rank: int) -> DistRankInfo:
assert 0 <= rank < self.world_size, f'rank must be from [0, world_size), Got {rank}'
return self._megatron_rank_info[rank]
@property
def tp_size(self):
assert self._megatron_global_info is not None, "MegatronWorkerGroup._megatron_global_info must be initialized"
return self._megatron_global_info.tp_size
@property
def dp_size(self):
assert self._megatron_global_info is not None, "MegatronWorkerGroup._megatron_global_info must be initialized"
return self._megatron_global_info.dp_size
@property
def pp_size(self):
assert self._megatron_global_info is not None, "MegatronWorkerGroup._megatron_global_info must be initialized"
return self._megatron_global_info.pp_size
@property
def cp_size(self):
assert self._megatron_global_info is not None, "MegatronWorkerGroup._megatron_global_info must be initialized"
return self._megatron_global_info.cp_size
def get_megatron_global_info(self):
return self._megatron_global_info
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ray
@ray.remote
class WorkerGroupRegisterCenter:
def __init__(self, rank_zero_info):
self.rank_zero_info = rank_zero_info
def get_rank_zero_info(self):
return self.rank_zero_info
def create_worker_group_register_center(name, info):
return WorkerGroupRegisterCenter.options(name=name).remote(info)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The base class for Worker
"""
import os
import socket
from dataclasses import dataclass
from .decorator import register, Dispatch, Execute
@dataclass
class DistRankInfo:
tp_rank: int
dp_rank: int
pp_rank: int
cp_rank: int
@dataclass
class DistGlobalInfo:
tp_size: int
dp_size: int
pp_size: int
cp_size: int
class WorkerHelper:
def _get_node_ip(self):
def get_node_ip_by_sdk():
if os.getenv("WG_BACKEND", None) == "ray":
import ray
return ray._private.services.get_node_ip_address()
else:
                raise NotImplementedError("WG_BACKEND currently only supports ray mode.")
host_ipv4 = os.getenv("MY_HOST_IP", None)
host_ipv6 = os.getenv("MY_HOST_IPV6", None)
host_ip_by_env = host_ipv4 or host_ipv6
host_ip_by_sdk = get_node_ip_by_sdk()
host_ip = host_ip_by_env or host_ip_by_sdk
return host_ip
def _get_free_port(self):
with socket.socket() as sock:
sock.bind(('', 0))
return sock.getsockname()[1]
def get_availale_master_addr_port(self):
return self._get_node_ip(), str(self._get_free_port())
def _get_pid(self):
return
class WorkerMeta:
keys = [
"WORLD_SIZE", "RANK", "LOCAL_WORLD_SIZE", "LOCAL_RANK", "MASTER_ADDR", "MASTER_PORT", "CUDA_VISIBLE_DEVICES"
]
def __init__(self, store) -> None:
self._store = store
def to_dict(self):
return {f"_{key.lower()}": self._store.get(f"_{key.lower()}", None) for key in WorkerMeta.keys}
# we assume that in each WorkerGroup, there is a Master Worker
class Worker(WorkerHelper):
"""A (distributed) worker."""
def __new__(cls, *args, **kwargs):
instance = super().__new__(cls)
        # DISABLE_WORKER_INIT is parsed as an int: any non-zero value skips the
        # register-center configuration below
        disable_worker_init = int(os.environ.get('DISABLE_WORKER_INIT', 0))
if disable_worker_init:
return instance
rank = os.environ.get("RANK", None)
worker_group_prefix = os.environ.get("WG_PREFIX", None)
        # when the @ray.remote decorator is applied, __new__ is also called on the generated
        # ActorClass wrapper; skip _configure_before_init in that case
if None not in [rank, worker_group_prefix] and 'ActorClass(' not in cls.__name__:
instance._configure_before_init(f"{worker_group_prefix}_register_center", int(rank))
return instance
def _configure_before_init(self, register_center_name: str, rank: int):
assert isinstance(rank, int), f"rank must be int, instead of {type(rank)}"
if rank == 0:
            master_addr, master_port = self.get_available_master_addr_port()
rank_zero_info = {
"MASTER_ADDR": master_addr,
"MASTER_PORT": master_port,
}
if os.getenv("WG_BACKEND", None) == "ray":
from verl.single_controller.base.register_center.ray import create_worker_group_register_center
self.register_center = create_worker_group_register_center(name=register_center_name,
info=rank_zero_info)
os.environ.update(rank_zero_info)
def __init__(self, cuda_visible_devices=None) -> None:
        # construct a meta from environment variables. Note that the import must be inside
        # the function because it is executed remotely
import os
###
# [SUPPORT AMD: torch]
import torch
###
###
# [SUPPORT AMD: torch]
if "AMD" in torch.cuda.get_device_name():
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ.get('ROCR_VISIBLE_DEVICES')
os.environ['LOCAL_RANK'] = os.environ.get('RAY_LOCAL_RANK')
###
world_size = int(os.environ['WORLD_SIZE'])
rank = int(os.environ['RANK'])
self._rank = rank
self._world_size = world_size
master_addr = os.environ["MASTER_ADDR"]
master_port = os.environ["MASTER_PORT"]
local_world_size = int(os.getenv("LOCAL_WORLD_SIZE", "1"))
local_rank = int(os.getenv("LOCAL_RANK", "0"))
        ###
        # [SUPPORT AMD: torch]
        if "AMD" in torch.cuda.get_device_name():
            self.local_rank = int(os.environ['LOCAL_RANK'])
            cuda_visible_devices = str(local_rank)
        ###
store = {
'_world_size': world_size,
'_rank': rank,
'_local_world_size': local_world_size,
'_local_rank': local_rank,
'_master_addr': master_addr,
'_master_port': master_port
}
if cuda_visible_devices is not None:
store['_cuda_visible_devices'] = cuda_visible_devices
meta = WorkerMeta(store=store)
self._configure_with_meta(meta=meta)
###
# [SUPPORT AMD: torch]
# torch.cuda.set_device(local_rank)
if "AMD" in torch.cuda.get_device_name():
torch.cuda.set_device(int(cuda_visible_devices))
###
def _configure_with_meta(self, meta: WorkerMeta):
"""
This function should only be called inside by WorkerGroup
"""
assert isinstance(meta, WorkerMeta)
self.__dict__.update(meta.to_dict()) # this is hacky
# print(f"__dict__: {self.__dict__}")
for key in WorkerMeta.keys:
val = self.__dict__.get(f"_{key.lower()}", None)
if val is not None:
# print(f"set {key} to {val}")
os.environ[key] = str(val)
os.environ["REDIS_STORE_SERVER_HOST"] = str(self._master_addr).replace("[", "").replace(
"]", "") if self._master_addr else ""
def get_master_addr_port(self):
return self._master_addr, self._master_port
    def get_cuda_visible_devices(self):
        return os.environ.get("CUDA_VISIBLE_DEVICES", "not set")
@property
def world_size(self):
return self._world_size
@property
def rank(self):
return self._rank
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO_WITH_FUNC)
def execute_with_func_generator(self, func, *args, **kwargs):
ret_proto = func(self, *args, **kwargs)
return ret_proto
@register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.RANK_ZERO)
def execute_func_rank_zero(self, func, *args, **kwargs):
result = func(*args, **kwargs)
return result
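# Standalone usage sketch (requires a CUDA-capable device because __init__ probes
# torch.cuda.get_device_name(); all values are illustrative):
#
#   os.environ.update({
#       "DISABLE_WORKER_INIT": "1",  # skip the register-center handshake in __new__
#       "WORLD_SIZE": "1", "RANK": "0", "LOCAL_WORLD_SIZE": "1", "LOCAL_RANK": "0",
#       "MASTER_ADDR": "127.0.0.1", "MASTER_PORT": "29500",
#   })
#   w = Worker()
#   assert w.rank == 0 and w.world_size == 1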
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
the class of WorkerGroup
"""
import logging
import threading
import signal
import time
from typing import List, Any, Callable, Dict
from .decorator import MAGIC_ATTR, Dispatch, get_predefined_dispatch_fn, get_predefined_execute_fn
class ResourcePool:
"""The resource pool with meta info such as world_size."""
def __init__(self, process_on_nodes=None, max_colocate_count: int = 10, n_gpus_per_node=8) -> None:
if process_on_nodes is None:
process_on_nodes = []
self._store = process_on_nodes
self.max_colocate_count = max_colocate_count
        self.n_gpus_per_node = n_gpus_per_node  # reserved for future Huawei hardware with 16 devices per node
def add_node(self, process_count):
self._store.append(process_count)
@property
def world_size(self):
return sum(self._store)
def __call__(self) -> Any:
return self._store
@property
def store(self):
return self._store
def local_world_size_list(self) -> List[int]:
nested_local_world_size_list = [
[local_world_size for _ in range(local_world_size)] for local_world_size in self._store
]
return [item for row in nested_local_world_size_list for item in row]
    def local_rank_list(self) -> List[int]:
        nested_local_rank_list = [list(range(local_world_size)) for local_world_size in self._store]
        return [item for row in nested_local_rank_list for item in row]
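# Worked example: a pool spanning two 8-process nodes and one 4-process node flattens
# into per-process views, which downstream code uses to assign LOCAL_RANK and
# LOCAL_WORLD_SIZE for each worker.
_example_pool = ResourcePool(process_on_nodes=[8, 8, 4])
assert _example_pool.world_size == 20
assert _example_pool.local_world_size_list()[:3] == [8, 8, 8]
assert _example_pool.local_rank_list()[16:] == [0, 1, 2, 3]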
class ClassWithInitArgs:
"""
This class stores a class constructor and the args/kwargs to construct the class.
It is used to instantiate the remote class.
"""
def __init__(self, cls, *args, **kwargs) -> None:
self.cls = cls
self.args = args
self.kwargs = kwargs
# def add_arg(self, arg):
# self.args += (arg,)
# def add_kwarg(self, key, value):
# self.kwargs[key] = value
def __call__(self) -> Any:
return self.cls(*self.args, **self.kwargs)
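# Illustrative use: ClassWithInitArgs defers construction so the same constructor and
# arguments can be shipped to a remote process and instantiated there; `dict` stands
# in for a user-defined worker class.
_example_cwia = ClassWithInitArgs(dict, a=1)
assert _example_cwia() == {'a': 1}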
def check_workers_alive(workers: List, is_alive: Callable, gap_time: float = 1) -> None:
    while True:
        for worker in workers:
            if not is_alive(worker):
                logging.warning(f"worker {worker} is not alive, sending SIGABRT to the main thread")
                signal.raise_signal(signal.SIGABRT)
        time.sleep(gap_time)
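# Typical wiring (mirrors WorkerGroup.start_worker_aliveness_check below); `workers`
# and `is_alive_fn` are placeholders for a concrete backend's handles and probe:
#
#   t = threading.Thread(target=check_workers_alive, args=(workers, is_alive_fn, 1.0))
#   t.start()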
class WorkerGroup:
"""A group of workers"""
def __init__(self, resource_pool: ResourcePool, **kwargs) -> None:
        self._is_init_with_detached_workers = resource_pool is None
if resource_pool is not None:
            # handle the case when the WorkerGroup is attached to an existing resource pool
self._procecss_dispatch_config = resource_pool()
else:
self._procecss_dispatch_config = None
self._workers = []
self._worker_names = []
self._master_addr = None
self._master_port = None
self._checker_thread: threading.Thread = None
    def _is_worker_alive(self, worker):
        raise NotImplementedError("WorkerGroup._is_worker_alive should be implemented by the derived class.")
    def _block_until_all_workers_alive(self) -> None:
        while True:
            all_state = [self._is_worker_alive(worker) for worker in self._workers]
            if all(all_state):
                break
            time.sleep(1)
def start_worker_aliveness_check(self, every_n_seconds=1) -> None:
# before starting checking worker aliveness, make sure all workers are already alive
self._block_until_all_workers_alive()
self._checker_thread = threading.Thread(target=check_workers_alive,
args=(self._workers, self._is_worker_alive, every_n_seconds))
self._checker_thread.start()
@property
def world_size(self):
return len(self._workers)
    # execute_all_async and execute_rank_zero_async should be implemented by the concrete
    # worker groups such as RayWorkerGroup, TorchRPCWorkerGroup, MegatronWorkerGroup and XperfWorkerGroup
def _bind_worker_method(self, user_defined_cls, func_generator):
"""
Bind the worker method to the WorkerGroup
"""
        for method_name in dir(user_defined_cls):
            try:
                method = getattr(user_defined_cls, method_name)
                assert callable(method), f"{method_name} in {user_defined_cls} is not callable"
            except Exception:
                # properties fail here because they can only be resolved on an instance; skip them
                continue
if hasattr(method, MAGIC_ATTR):
# this method is decorated by register
attribute = getattr(method, MAGIC_ATTR)
assert isinstance(attribute, Dict), f'attribute must be a dictionary. Got {type(attribute)}'
                assert 'dispatch_mode' in attribute, 'attribute must contain dispatch_mode in its keys'
dispatch_mode = attribute['dispatch_mode']
execute_mode = attribute['execute_mode']
blocking = attribute['blocking']
# get dispatch fn
if isinstance(dispatch_mode, Dispatch):
# get default dispatch fn
fn = get_predefined_dispatch_fn(dispatch_mode=dispatch_mode)
dispatch_fn = fn['dispatch_fn']
collect_fn = fn['collect_fn']
else:
assert isinstance(dispatch_mode, dict)
assert 'dispatch_fn' in dispatch_mode
assert 'collect_fn' in dispatch_mode
dispatch_fn = dispatch_mode['dispatch_fn']
collect_fn = dispatch_mode['collect_fn']
# get execute_fn_name
execute_mode = get_predefined_execute_fn(execute_mode=execute_mode)
wg_execute_fn_name = execute_mode['execute_fn_name']
# get execute_fn from string
                try:
                    execute_fn = getattr(self, wg_execute_fn_name)
                    assert callable(execute_fn), 'execute_fn must be callable'
                except Exception:
                    print(f'execute_fn {wg_execute_fn_name} is invalid')
                    raise
# bind a new method to the RayWorkerGroup
func = func_generator(self,
method_name,
dispatch_fn=dispatch_fn,
collect_fn=collect_fn,
execute_fn=execute_fn,
blocking=blocking)
                try:
                    setattr(self, method_name, func)
                except Exception as e:
                    raise ValueError(f'Failed to set method {method_name}') from e
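# Minimal sketch of the func_generator consumed by _bind_worker_method above (a toy
# stand-in; the real generators live in the concrete backends such as RayWorkerGroup):
# it wraps each registered method so calls flow dispatch -> execute -> collect.
def _example_func_generator(wg, method_name, dispatch_fn, collect_fn, execute_fn, blocking):
    def func(*args, **kwargs):
        # scatter the inputs across workers according to the dispatch mode
        args, kwargs = dispatch_fn(wg, *args, **kwargs)
        # run the bound method on the selected workers (all ranks, rank zero, ...)
        output = execute_fn(method_name, *args, **kwargs)
        # gather the per-worker outputs back into a single result; a real backend
        # would additionally block on futures here when `blocking` is True
        return collect_fn(wg, output)
    return func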
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup, create_colocated_worker_cls