"pytorch_pretrained_bert/tokenization_bert.py" did not exist on "751beb9e73c39065ea8b76fbfead9007a70a054f"
Commit 7f6cc211 authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
Pipeline #2874 failed with stages
in 0 seconds
# requirements.txt records the full set of dependencies for development
accelerate
codetiming
datasets
dill
hydra-core
numpy<2.0.0
pandas
peft
pyarrow>=15.0.0
pybind11
pylatexenc
tensordict>=0.8.0,<=0.9.1,!=0.9.0
transformers==4.52.4
ray==2.46.0
wandb
mathruler
torchdata
einops
qwen_vl_utils
torchvision==0.20.1
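# A typical way to consume this file (a general sketch; the file name "requirements.txt" is assumed):
#   pip install -r requirements.txt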
# requirements.txt records the full set of dependencies for development
accelerate
codetiming
datasets
dill
flash-attn
hydra-core
liger-kernel
numpy<2.0.0
pandas
peft
pyarrow>=19.0.0
pybind11
pylatexenc
pre-commit
ray[default]
tensordict>=0.8.0,<=0.9.1,!=0.9.0
torchdata
transformers
# vllm==0.8.4
wandb
packaging>=20.0
uvicorn
fastapi
latex2sympy2_extended
math_verify
# requirements.txt records the full set of dependencies for development
accelerate
codetiming
datasets
dill
flash-attn
hydra-core
numpy<2.0.0
pandas
peft
pyarrow>=19.0.0
pybind11
pylatexenc
ray[default]>=2.10
tensordict>=0.8.0,<=0.9.1,!=0.9.0
torchdata
torchvision
transformers
wandb
sglang[all]==0.4.6.post5
torch-memory-saver>=0.0.5
huggingface_hub
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
#export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
#unset ROCR_VISIBLE_DEVICES
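# The exports above make all 8 AMD GPUs visible through HIP; as far as we understand the flag,
# RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 keeps Ray from overriding HIP_VISIBLE_DEVICES in its workers.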
# PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
# data.train_files=/data/gsm8k/train.parquet \
# data.val_files=/data/gsm8k/test.parquet \
# data.train_batch_size=256 \
# data.max_prompt_length=512 \
# data.max_response_length=256 \
# actor_rollout_ref.model.path=/model/Qwen2.5-0.5B-Instruct \
# actor_rollout_ref.actor.optim.lr=1e-6 \
# actor_rollout_ref.actor.ppo_mini_batch_size=64 \
# actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
# actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
# actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
# actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
# actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
# critic.optim.lr=1e-5 \
# critic.model.path=/model/Qwen2.5-0.5B-Instruct \
# critic.ppo_micro_batch_size_per_gpu=4 \
# algorithm.kl_ctrl.kl_coef=0.001 \
# trainer.logger=console \
# trainer.val_before_train=False \
# trainer.n_gpus_per_node=1 \
# trainer.nnodes=1 \
# trainer.save_freq=10 \
# trainer.test_freq=10 \
# trainer.total_epochs=15 2>&1 | tee verl_demo.log
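# The live command below runs GRPO (algorithm.adv_estimator=grpo) with a vLLM rollout on a single node
# with 8 GPUs, sampling n=5 responses per prompt; all values come from the arguments that follow.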
PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=/data/gsm8k/train.parquet \
data.val_files=/data/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.max_prompt_length=512 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=/model/Qwen2.5-0.5B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name='test' \
trainer.experiment_name='qwen2_5_0_5b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.default_local_dir=/verl/qwen2_5_14b_verl_grpo_8 \
trainer.nnodes=1 \
trainer.save_freq=5 \
trainer.test_freq=5 \
trainer.total_epochs=1
# export CUDA_VISIBLE_DEVICES=0
# export RAY_DISABLE_GPU_AUTODETECTION=1
# PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
# data.train_files=/data/gsm8k/train.parquet \
# data.val_files=/data/gsm8k/test.parquet \
# data.train_batch_size=256 \
# data.max_prompt_length=512 \
# data.max_response_length=256 \
# actor_rollout_ref.model.path=/model/Qwen2.5-0.5B-Instruct \
# actor_rollout_ref.actor.optim.lr=1e-6 \
# actor_rollout_ref.actor.ppo_mini_batch_size=64 \
# actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
# actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
# actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
# actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
# actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
# critic.optim.lr=1e-5 \
# critic.model.path=/model/Qwen2.5-0.5B-Instruct \
# critic.ppo_micro_batch_size_per_gpu=4 \
# algorithm.kl_ctrl.kl_coef=0.001 \
# trainer.logger=console \
# trainer.val_before_train=False \
# trainer.n_gpus_per_node=1 \
# trainer.nnodes=1 \
# trainer.save_freq=10 \
# trainer.test_freq=10 \
# trainer.total_epochs=15 2>&1 | tee verl_demo.log
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2025 Bytedance Ltd. and/or its affiliates
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import warnings
from contextlib import contextmanager
from importlib.metadata import version
from typing import Any, Callable, ContextManager, Optional
import numpy as np
import torch
import torch.distributed as dist
from accelerate import init_empty_weights
from megatron.core import dist_checkpointing
from megatron.core import parallel_state as mpu
from megatron.core.dist_checkpointing.mapping import ShardedTensor
from megatron.core.dist_checkpointing.serialization import StrictHandling
from megatron.core.models.gpt.gpt_model import ModelType
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from packaging.version import Version
from transformers import AutoConfig
from verl.model_merger.megatron_model_merger import get_dynamic_pipeline_shards
from verl.models.mcore import hf_to_mcore_config
from verl.utils.device import get_device_name, get_torch_device
from verl.utils.megatron_utils import get_model
def _init_args():
"""
Examples:
1. single rank conversion for any model:
> python converter_hf_to_mcore.py --hf_model_path ${hf_model} --output_path ${output_path}
2. distributed conversion for DeepseekV3 671B:
> torchrun --nproc_per_node 1 --nnodes 4 --node_rank ${RANK} converter_hf_to_mcore.py \
--hf_model_path ${hf_model} --output_path ${output_path}
"""
parser = argparse.ArgumentParser()
parser.add_argument("--hf_model_path", type=str, required=True, help="The path for the huggingface model")
parser.add_argument("--output_path", type=str, required=True, help="The path for the output mcore model")
parser.add_argument("--use_cpu_initialization", action="store_true", help="Whether to use cpu initialization")
parser.add_argument("--test", action="store_true", help="Whether to test the conversion")
parser.add_argument("--trust_remote_code", action="store_true", help="Whether to trust remote code")
args = parser.parse_args()
return args
def test_conversion(megatron_model_provider, tfconfig, output_path, model):
########### test ###########
# load model
model_test = get_model(
model_provider_func=megatron_model_provider,
model_type=ModelType.encoder_or_decoder,
wrap_with_ddp=True,
transformer_config=tfconfig,
)
ref_state_dict = model_test[0].module.sharded_state_dict()
dist_checkpointing.load(ref_state_dict, output_path, strict=StrictHandling.ASSUME_OK_UNEXPECTED)
dut_state_dict = model[0].module.state_dict()
for name in dut_state_dict.keys():
if dut_state_dict[name] is None:
print(f"[Warning] {name} is none in dut_state_dict")
continue
dut_data = dut_state_dict[name].data
if name in ref_state_dict:
ref_data = ref_state_dict[name]
if isinstance(ref_data, ShardedTensor):
ref_data = ref_data.data.view(ref_data.local_shape)
else:
ref_data = ref_data.data
assert dut_data.shape == ref_data.shape, f"{name=} {dut_data.shape=} {ref_data.shape=}"
assert (dut_data == ref_data).all(), f"{name} is not equal"
print(f"{name} is equal")
else:
print(f"[Warning] {name} is not in ref_state_dict")
for name in ref_state_dict.keys():
if ref_state_dict[name] is None:
print(f"[Warning] {name} is none in ref_state_dict")
continue
ref_data = ref_state_dict[name]
if isinstance(ref_data, ShardedTensor):
ref_data = ref_data.data.view(ref_data.local_shape)
else:
ref_data = ref_data.data
if name in dut_state_dict:
dut_data = dut_state_dict[name].data
assert dut_data.shape == ref_data.shape, f"{name=} {dut_data.shape=} {ref_data.shape=}"
assert (dut_data == ref_data).all(), f"{name} is not equal"
print(f"{name} is equal")
else:
print(f"[Warning] {name} is not in dut_state_dict")
print("Conversion test passed!")
@torch.inference_mode()
def convert_checkpoint_from_transformers_to_megatron(
hf_model, model, hf_config, layer_start_end: Optional[tuple[int, int]] = None
):
if layer_start_end is None:
layer_start_end = (0, len(model.decoder.layers))
layer_start, layer_end = layer_start_end
pp_rank = mpu.get_pipeline_model_parallel_rank()
pp_size = mpu.get_pipeline_model_parallel_world_size()
numel = 0
num_attention_heads = hf_config.num_attention_heads
num_key_value_heads = hf_config.num_key_value_heads
hidden_dim = hf_config.hidden_size
head_dim = getattr(hf_config, "head_dim", hidden_dim // num_attention_heads)
if num_attention_heads != num_key_value_heads:
print("[WARNING] Converting GQA model")
has_qkv_bias = getattr(hf_config, "qkv_bias", False) or getattr(hf_config, "attention_bias", False)
has_share_expert = getattr(hf_config, "shared_expert_intermediate_size", None)
if pp_rank == 0:
numel += safe_copy(hf_model.model.embed_tokens.weight, model.embedding.word_embeddings.weight)
assert len(model.decoder.layers) == (layer_end - layer_start), (
f"Expected {len(model.decoder.layers)} layers, but got {layer_end - layer_start}"
)
for layer_idx, (layer, hf_layer) in enumerate(
zip(model.decoder.layers, hf_model.model.layers[layer_start:layer_end], strict=True)
):
global_layer_idx = layer_idx + layer_start
numel_cur = numel
numel += safe_copy(hf_layer.input_layernorm.weight, layer.self_attention.linear_qkv.layer_norm_weight)
q = hf_layer.self_attn.q_proj.weight.view(
[num_key_value_heads, head_dim * num_attention_heads // num_key_value_heads, -1]
)
k = hf_layer.self_attn.k_proj.weight.view([num_key_value_heads, head_dim, -1])
v = hf_layer.self_attn.v_proj.weight.view([num_key_value_heads, head_dim, -1])
qkv = torch.cat([q, k, v], dim=1).view(-1, hidden_dim).contiguous()
numel += safe_copy(qkv, layer.self_attention.linear_qkv.weight)
if has_qkv_bias:
q_bias = hf_layer.self_attn.q_proj.bias.view([num_key_value_heads, -1])
k_bias = hf_layer.self_attn.k_proj.bias.view([num_key_value_heads, -1])
v_bias = hf_layer.self_attn.v_proj.bias.view([num_key_value_heads, -1])
qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=1).view(-1).contiguous()
numel += safe_copy(qkv_bias, layer.self_attention.linear_qkv.bias)
if hasattr(hf_layer.self_attn, "q_norm"):
numel += safe_copy(hf_layer.self_attn.q_norm.weight.data, layer.self_attention.q_layernorm.weight)
numel += safe_copy(hf_layer.self_attn.k_norm.weight.data, layer.self_attention.k_layernorm.weight)
numel += safe_copy(hf_layer.self_attn.o_proj.weight, layer.self_attention.linear_proj.weight)
numel += safe_copy(hf_layer.post_attention_layernorm.weight, layer.pre_mlp_layernorm.weight)
numel += safe_copy(hf_layer.mlp.gate.weight, layer.mlp.router.weight)
for idx, hf_expert in enumerate(hf_layer.mlp.experts):
fc1_weight = torch.cat([hf_expert.gate_proj.weight, hf_expert.up_proj.weight])
numel += safe_copy(fc1_weight, layer.mlp.experts.linear_fc1._parameters[f"weight{idx}"])
numel += safe_copy(hf_expert.down_proj.weight, layer.mlp.experts.linear_fc2._parameters[f"weight{idx}"])
if has_share_expert:
numel += safe_copy(hf_layer.mlp.shared_expert_gate.weight, layer.mlp.shared_experts.gate_weight)
shared_fc1_weight = torch.cat(
[hf_layer.mlp.shared_expert.gate_proj.weight, hf_layer.mlp.shared_expert.up_proj.weight]
)
numel += safe_copy(shared_fc1_weight, layer.mlp.shared_experts.linear_fc1.weight)
numel += safe_copy(hf_layer.mlp.shared_expert.down_proj.weight, layer.mlp.shared_experts.linear_fc2.weight)
print(f"{pp_rank=} {global_layer_idx=} {layer_idx=} {numel=} numel this layer={numel - numel_cur}")
if pp_rank == pp_size - 1:
numel += safe_copy(hf_model.model.norm.weight, model.decoder.final_layernorm.weight)
numel += safe_copy(hf_model.lm_head.weight, model.output_layer.weight)
return numel
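# safe_copy checks that the shapes (and, unless skip_dtype_assert, the dtypes) match, copies src into dst
# in place, and returns the number of copied elements so callers can tally totals against the HF model.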
def safe_copy(
src_tensor: torch.Tensor,
dst_tensor: torch.Tensor,
skip_dtype_assert: bool = False,
):
if not skip_dtype_assert:
if src_tensor.dtype != dst_tensor.dtype:
raise ValueError(f"Get source dtype {src_tensor.dtype}, but target dtype {dst_tensor.dtype}")
assert src_tensor.shape == dst_tensor.shape
dst_tensor.data.copy_(src_tensor.data)
return src_tensor.numel()
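# Qwen2.5-VL variant: copies the vision tower (patch embed, fused QKV per block, projector) and then the
# language model, casting both models to bfloat16 and asserting that the copied element count matches the
# number of HF parameters in each part.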
@torch.inference_mode()
def convert_checkpoint_from_transformers_to_megatron_qwen2_5_vl(hfmodel, mgmodel, hf_config):
mgmodel = mgmodel.bfloat16()
hfmodel = hfmodel.bfloat16()
num_attention_heads = hf_config.num_attention_heads
num_query_groups = hf_config.num_key_value_heads
hidden_size = hf_config.hidden_size
head_dim = hidden_size // num_attention_heads
# 1. vision model
if Version(version("transformers")) < Version("4.52.0"):
print("Using transformers < 4.52 API to load vision model")
hfvision = hfmodel.visual
else:
hfvision = hfmodel.model.visual
mgvision = mgmodel.vision_model
vision_hidden_size = mgvision.config.hidden_size
vision_num_query_groups = mgvision.config.num_query_groups
vision_head_dim = vision_hidden_size // mgvision.config.num_attention_heads
copied_numel = 0
safe_copy(hfvision.rotary_pos_emb.inv_freq, mgvision.rotary_pos_emb.inv_freq)
copied_numel += safe_copy(hfvision.patch_embed.proj.weight, mgvision.patch_embed.proj.weight)
for hfblock, mgblock in zip(hfvision.blocks, mgvision.decoder.layers, strict=True):
# norm1 --> linear_qkv.norm
copied_numel += safe_copy(hfblock.norm1.weight, mgblock.self_attention.linear_qkv.layer_norm_weight)
# norm2 --> mlp.linear_fc1.norm
copied_numel += safe_copy(hfblock.norm2.weight, mgblock.mlp.linear_fc1.layer_norm_weight)
# qkv --> self_attention.linear_qkv
converted_weight = (
hfblock.attn.qkv.weight.view(3, vision_num_query_groups, -1, vision_head_dim, vision_hidden_size)
.transpose(0, 1)
.flatten(1, 2)
.reshape(-1, vision_hidden_size)
.contiguous()
)
copied_numel += safe_copy(converted_weight, mgblock.self_attention.linear_qkv.weight)
converted_bias = (
hfblock.attn.qkv.bias.view(3, vision_num_query_groups, -1)
.transpose(0, 1)
.flatten(1, 2)
.view(-1)
.contiguous()
)
copied_numel += safe_copy(converted_bias, mgblock.self_attention.linear_qkv.bias)
# proj --> self_attention.linear_proj
copied_numel += safe_copy(hfblock.attn.proj.weight, mgblock.self_attention.linear_proj.weight)
copied_numel += safe_copy(hfblock.attn.proj.bias, mgblock.self_attention.linear_proj.bias)
# mlp --> mlp: gate
fc1_weight = torch.cat([hfblock.mlp.gate_proj.weight, hfblock.mlp.up_proj.weight])
fc1_bias = torch.cat([hfblock.mlp.gate_proj.bias, hfblock.mlp.up_proj.bias])
copied_numel += safe_copy(fc1_weight, mgblock.mlp.linear_fc1.weight)
copied_numel += safe_copy(fc1_bias, mgblock.mlp.linear_fc1.bias)
copied_numel += safe_copy(hfblock.mlp.down_proj.weight, mgblock.mlp.linear_fc2.weight)
copied_numel += safe_copy(hfblock.mlp.down_proj.bias, mgblock.mlp.linear_fc2.bias)
# 2. vision projector
hfprojector = hfvision.merger
mgprojector = mgvision.projection
copied_numel += safe_copy(hfprojector.ln_q.weight, mgvision.decoder.final_layernorm.weight)
copied_numel += safe_copy(hfprojector.mlp[0].weight, mgprojector.encoder.linear_fc1.weight)
copied_numel += safe_copy(hfprojector.mlp[0].bias, mgprojector.encoder.linear_fc1.bias)
copied_numel += safe_copy(hfprojector.mlp[2].weight, mgprojector.encoder.linear_fc2.weight)
copied_numel += safe_copy(hfprojector.mlp[2].bias, mgprojector.encoder.linear_fc2.bias)
n_params = sum([t.numel() for t in hfvision.state_dict().values()])
assert n_params == copied_numel, f"n_params={n_params} != copied_numel={copied_numel}"
# 3. llm [just Qwen2]
if Version(version("transformers")) < Version("4.52.0"):
print("Using transformers < 4.52 API to load llm")
hfllm = hfmodel.model
else:
hfllm = hfmodel.model.language_model
mgllm = mgmodel.language_model
copied_numel = 0
copied_numel += safe_copy(hfllm.embed_tokens.weight, mgllm.embedding.word_embeddings.weight)
layermaps = zip(mgllm.decoder.layers, hfllm.layers, strict=True)
for mglayer, hflayer in layermaps:
copied_numel += safe_copy(hflayer.input_layernorm.weight, mglayer.self_attention.linear_qkv.layer_norm_weight)
q_proj_weight = hflayer.self_attn.q_proj.weight.view(num_query_groups, -1, head_dim, hidden_size)
k_proj_weight = hflayer.self_attn.k_proj.weight.view(num_query_groups, -1, head_dim, hidden_size)
v_proj_weight = hflayer.self_attn.v_proj.weight.view(num_query_groups, -1, head_dim, hidden_size)
qkv_proj = torch.cat([q_proj_weight, k_proj_weight, v_proj_weight], dim=1).view(-1, hidden_size).contiguous()
copied_numel += safe_copy(qkv_proj, mglayer.self_attention.linear_qkv.weight)
q_proj_bias = hflayer.self_attn.q_proj.bias.view(num_query_groups, -1)
k_proj_bias = hflayer.self_attn.k_proj.bias.view(num_query_groups, -1)
v_proj_bias = hflayer.self_attn.v_proj.bias.view(num_query_groups, -1)
qkv_bias = torch.cat([q_proj_bias, k_proj_bias, v_proj_bias], dim=1).view(-1).contiguous()
copied_numel += safe_copy(qkv_bias, mglayer.self_attention.linear_qkv.bias)
copied_numel += safe_copy(hflayer.self_attn.o_proj.weight, mglayer.self_attention.linear_proj.weight)
fc1_weight = torch.cat([hflayer.mlp.gate_proj.weight, hflayer.mlp.up_proj.weight])
copied_numel += safe_copy(fc1_weight, mglayer.mlp.linear_fc1.weight)
copied_numel += safe_copy(hflayer.mlp.down_proj.weight, mglayer.mlp.linear_fc2.weight)
copied_numel += safe_copy(hflayer.post_attention_layernorm.weight, mglayer.mlp.linear_fc1.layer_norm_weight)
copied_numel += safe_copy(hfllm.norm.weight, mgllm.decoder.final_layernorm.weight)
if not hf_config.tie_word_embeddings:
safe_copy(hfmodel.lm_head.weight, mgllm.output_layer.weight)
n_params = sum([t.numel() for t in hfllm.state_dict().values()])
assert n_params == copied_numel, f"n_params={n_params} != copied_numel={copied_numel}"
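# DeepseekV3 variant: handles the low-rank q/kv down- and up-projections with their layernorms, and the MoE
# router with its expert bias, supporting both grouped-GEMM and per-expert layouts; MTP weights are not
# supported yet (see the warning below).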
@torch.inference_mode()
def convert_checkpoint_from_transformers_to_megatron_dpskv3(
hf_model,
model,
hf_config,
tfconfig,
layer_start_end: Optional[tuple[int, int]] = None,
):
warnings.warn("MTP model is not supported yet", stacklevel=2)
if layer_start_end is None:
layer_start_end = (0, len(model.decoder.layers))
layer_start, layer_end = layer_start_end
numel: int = 0
pp_rank = mpu.get_pipeline_model_parallel_rank()
pp_size = mpu.get_pipeline_model_parallel_world_size()
if pp_rank == 0:
numel += safe_copy(hf_model.model.embed_tokens.weight, model.embedding.word_embeddings.weight)
assert len(model.decoder.layers) == (layer_end - layer_start), (
f"Expected {len(model.decoder.layers)} layers, but got {layer_end - layer_start}"
)
for layer_idx, (layer, hf_layer) in enumerate(
zip(model.decoder.layers, hf_model.model.layers[layer_start:layer_end], strict=True)
):
global_layer_idx = layer_idx + layer_start
numel_cur: int = numel
numel += safe_copy(hf_layer.input_layernorm.weight, layer.input_layernorm.weight)
if hf_config.q_lora_rank is None:
numel += safe_copy(hf_layer.self_attn.q_proj.weight, layer.self_attention.linear_q_proj.weight)
else:
numel += safe_copy(hf_layer.self_attn.q_a_proj.weight, layer.self_attention.linear_q_down_proj.weight)
numel += safe_copy(hf_layer.self_attn.q_b_proj.weight, layer.self_attention.linear_q_up_proj.weight)
numel += safe_copy(
hf_layer.self_attn.q_a_layernorm.weight, layer.self_attention.linear_q_up_proj.layer_norm_weight
)
numel += safe_copy(
hf_layer.self_attn.kv_a_proj_with_mqa.weight, layer.self_attention.linear_kv_down_proj.weight
)
numel += safe_copy(hf_layer.self_attn.kv_b_proj.weight, layer.self_attention.linear_kv_up_proj.weight)
numel += safe_copy(
hf_layer.self_attn.kv_a_layernorm.weight, layer.self_attention.linear_kv_up_proj.layer_norm_weight
)
numel += safe_copy(hf_layer.self_attn.o_proj.weight, layer.self_attention.linear_proj.weight)
if not hasattr(layer.mlp, "router"):
numel += safe_copy(hf_layer.post_attention_layernorm.weight, layer.mlp.linear_fc1.layer_norm_weight)
numel += safe_copy(
torch.cat([hf_layer.mlp.gate_proj.weight, hf_layer.mlp.up_proj.weight]), layer.mlp.linear_fc1.weight
)
numel += safe_copy(hf_layer.mlp.down_proj.weight, layer.mlp.linear_fc2.weight)
else:
numel += safe_copy(hf_layer.mlp.gate.weight, layer.mlp.router.weight)
# NOTE: the e_score_correction_bias in the mcore model is initialized in bfloat16 and recovered to fp32
# in the first forward pass, so there is always a small diff (~0.3%) in this bias between the two models
numel += safe_copy(
hf_layer.mlp.gate.e_score_correction_bias, layer.mlp.router.expert_bias, skip_dtype_assert=True
)
if tfconfig.moe_grouped_gemm:
for i, hf_expert in enumerate(hf_layer.mlp.experts):
fc1_weight = torch.cat([hf_expert.gate_proj.weight, hf_expert.up_proj.weight])
linear_fc1_weighti = getattr(layer.mlp.experts.linear_fc1, "weight" + str(i))
numel += safe_copy(fc1_weight, linear_fc1_weighti)
linear_fc2_weighti = getattr(layer.mlp.experts.linear_fc2, "weight" + str(i))
numel += safe_copy(hf_expert.down_proj.weight, linear_fc2_weighti)
else:
for i, hf_expert in enumerate(hf_layer.mlp.experts):
expert = layer.mlp.experts.local_experts[i]
fc1_weight = torch.cat([hf_expert.gate_proj.weight, hf_expert.up_proj.weight])
numel += safe_copy(fc1_weight, expert.linear_fc1.weight)
numel += safe_copy(hf_expert.down_proj.weight, expert.linear_fc2.weight)
numel += safe_copy(hf_layer.post_attention_layernorm.weight, layer.pre_mlp_layernorm.weight)
shared_fc1_weight = torch.cat(
[hf_layer.mlp.shared_experts.gate_proj.weight, hf_layer.mlp.shared_experts.up_proj.weight]
)
numel += safe_copy(shared_fc1_weight, layer.mlp.shared_experts.linear_fc1.weight)
numel += safe_copy(hf_layer.mlp.shared_experts.down_proj.weight, layer.mlp.shared_experts.linear_fc2.weight)
print(f"{pp_rank=} {global_layer_idx=} {layer_idx=} {numel=} numel this layer={numel - numel_cur}")
assert numel - numel_cur == sum([i.numel() for i in hf_layer.state_dict().values()]), "numel mismatch"
if pp_rank == pp_size - 1:
numel += safe_copy(hf_model.model.norm.weight, model.decoder.final_layernorm.weight)
if not hf_config.tie_word_embeddings:
numel += safe_copy(hf_model.lm_head.weight, model.output_layer.weight)
print(f"{pp_rank=} {numel=}")
return numel
@contextmanager
def noop_context() -> Any:
yield
def support_distributed_convert(hf_config: AutoConfig) -> bool:
for arch in ["DeepseekV3ForCausalLM", "Qwen3MoeForCausalLM", "Qwen2MoeForCausalLM"]:
if arch in hf_config.architectures:
return True
return False
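# Main entry point: initializes torch.distributed and Megatron parallel state (pipeline parallelism across the
# world size), builds the mcore model from the HF config, loads the HF weights, dispatches to the matching
# converter, and saves the result with Megatron dist_checkpointing (optionally verifying it when --test is passed).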
def convert_hf_to_mcore(hf_model_path, output_path, use_cpu_initialization=False, test=False, trust_remote_code=False):
os.makedirs(output_path, exist_ok=True)
if len(os.listdir(output_path)) > 0 and not test:
print(f"Output path {output_path} is not empty, skipping conversion")
return
# init torch distributed and mpu
if "WORLD_SIZE" not in os.environ:
os.environ["RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
torch.distributed.init_process_group("nccl")
rank = dist.get_rank()
local_rank = os.getenv("LOCAL_RANK", 0)
world_size = dist.get_world_size()
get_torch_device().set_device(f"{get_device_name()}:{local_rank}")
mpu.initialize_model_parallel(
tensor_model_parallel_size=1,
pipeline_model_parallel_size=world_size,
virtual_pipeline_model_parallel_size=None,
context_parallel_size=1,
expert_model_parallel_size=1,
)
model_parallel_cuda_manual_seed(0)
# init hf config
hf_config = AutoConfig.from_pretrained(hf_model_path)
print(hf_config, flush=True)
if world_size > 1 and not support_distributed_convert(hf_config):
raise NotImplementedError(f"distributed conversion is not supported for {hf_config.architectures} yet.")
pipeline_shards = get_dynamic_pipeline_shards(hf_config.num_hidden_layers, world_size)
print(f"Pipeline shards: {pipeline_shards}", flush=True)
tfconfig = hf_to_mcore_config(
hf_config,
torch.bfloat16,
num_layers_in_first_pipeline_stage=pipeline_shards[0] if len(pipeline_shards) > 1 else None,
num_layers_in_last_pipeline_stage=pipeline_shards[-1] if len(pipeline_shards) > 2 else None,
)
tfconfig.use_cpu_initialization = use_cpu_initialization
tie_word_embeddings = getattr(hf_config, "tie_word_embeddings", False)
# init megatron model
def megatron_model_provider(pre_process, post_process):
from verl.models.mcore import init_mcore_model
parallel_model = init_mcore_model(
tfconfig,
hf_config,
pre_process,
post_process,
share_embeddings_and_output_weights=tie_word_embeddings,
value=False,
)
return parallel_model
context: Callable[..., ContextManager] = init_empty_weights if use_cpu_initialization else noop_context
with context():
model = get_model(
model_provider_func=megatron_model_provider,
model_type=ModelType.encoder_or_decoder,
wrap_with_ddp=False,
transformer_config=tfconfig,
)
if use_cpu_initialization:
# convert meta device to empty tensor so it can use `copy_` function
model[0].module = model[0].module.to_empty(device="cpu")
with warnings.catch_warnings():
warnings.simplefilter("ignore")
from transformers import AutoModelForCausalLM, AutoModelForImageTextToText
# init hf model
if "Qwen2_5_VLForConditionalGeneration" in hf_config.architectures:
hf_model = AutoModelForImageTextToText.from_pretrained(
hf_model_path, torch_dtype=torch.bfloat16, trust_remote_code=trust_remote_code
)
else:
hf_model = AutoModelForCausalLM.from_pretrained(
hf_model_path, torch_dtype=torch.bfloat16, trust_remote_code=trust_remote_code
)
hf_state_dict = hf_model.state_dict()
# distributed convert
if world_size > 1 and support_distributed_convert(hf_config):
pipeline_cumsum = np.cumsum(pipeline_shards)
layer_start = 0 if rank == 0 else pipeline_cumsum[rank - 1]
layer_end = pipeline_cumsum[rank]
if "DeepseekV3ForCausalLM" in hf_config.architectures:
numel_partial: int = convert_checkpoint_from_transformers_to_megatron_dpskv3(
hf_model, model[0].module, hf_config, tfconfig=tfconfig, layer_start_end=(layer_start, layer_end)
)
elif "Qwen3MoeForCausalLM" in hf_config.architectures or "Qwen2MoeForCausalLM" in hf_config.architectures:
numel_partial: int = convert_checkpoint_from_transformers_to_megatron(
hf_model, model[0].module, hf_config, layer_start_end=(layer_start, layer_end)
)
else:
raise NotImplementedError(f"Distributed conversion is not supported for {hf_config.architectures} yet.")
numel_tensor = torch.tensor([numel_partial]).to(get_device_name())
dist.all_reduce(numel_tensor, op=dist.ReduceOp.SUM)
numel = int(numel_tensor.cpu().item())
print(f"total numel={numel} vs {hf_model.num_parameters()=}")
if numel != hf_model.num_parameters():
warnings.warn(f"numel mismatch: {numel=} != {hf_model.num_parameters()=}", stacklevel=1)
# load hf state dict to megatron model
elif "Qwen2MoeForCausalLM" in hf_config.architectures:
convert_checkpoint_from_transformers_to_megatron(hf_model, model[0].module, hf_config)
elif "Qwen2_5_VLForConditionalGeneration" in hf_config.architectures:
convert_checkpoint_from_transformers_to_megatron_qwen2_5_vl(hf_model, model[0].module, hf_config)
elif "DeepseekV3ForCausalLM" in hf_config.architectures:
convert_checkpoint_from_transformers_to_megatron_dpskv3(hf_model, model[0].module, hf_config, tfconfig=tfconfig)
elif "Qwen3MoeForCausalLM" in hf_config.architectures:
convert_checkpoint_from_transformers_to_megatron(hf_model, model[0].module, hf_config)
else:
assert not use_cpu_initialization, "use_cpu_initialization is only supported for MoE models"
from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel
load_state_dict_to_megatron_gptmodel(
state_dict=hf_state_dict,
wrapped_models=model,
config=hf_config,
params_dtype=torch.bfloat16,
is_value_model=False,
)
megatron_state_dict = model[0].module.sharded_state_dict()
del hf_state_dict, hf_model
# save megatron model
if len(os.listdir(output_path)) == 0:
dist_checkpointing.save(megatron_state_dict, output_path, sharded_strategy=None, async_sharded_save=False)
if test:
test_conversion(megatron_model_provider, tfconfig, output_path, model)
if __name__ == "__main__":
args = _init_args()
convert_hf_to_mcore(
args.hf_model_path, args.output_path, args.use_cpu_initialization, args.test, args.trust_remote_code
)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Diagnose script for checking OS/hardware/python/pip/verl/network.
The output of this script can be a very good hint to issue/problem.
"""
import os
import platform
import socket
import subprocess
import sys
import time
import psutil
try:
from urllib.parse import urlparse
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen
from urlparse import urlparse
import argparse
import importlib.metadata
import torch
URLS = {
"PYPI": "https://pypi.python.org/pypi/pip",
}
REGIONAL_URLS = {
"cn": {
"PYPI(douban)": "https://pypi.douban.com/",
"Conda(tsinghua)": "https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/",
}
}
def test_connection(name, url, timeout=10):
"""Simple connection test"""
urlinfo = urlparse(url)
start = time.time()
try:
socket.gethostbyname(urlinfo.netloc)
except Exception as e:
print("Error resolving DNS for {}: {}, {}".format(name, url, e))
return
dns_elapsed = time.time() - start
start = time.time()
try:
_ = urlopen(url, timeout=timeout)
except Exception as e:
print("Error open {}: {}, {}, DNS finished in {} sec.".format(name, url, e, dns_elapsed))
return
load_elapsed = time.time() - start
print("Timing for {}: {}, DNS: {:.4f} sec, LOAD: {:.4f} sec.".format(name, url, dns_elapsed, load_elapsed))
def check_python():
print("----------Python Info----------")
print("Version :", platform.python_version())
print("Compiler :", platform.python_compiler())
print("Build :", platform.python_build())
print("Arch :", platform.architecture())
def check_pip():
print("------------Pip Info-----------")
try:
import pip
print("Version :", pip.__version__)
print("Directory :", os.path.dirname(pip.__file__))
except ImportError:
print("No corresponding pip install for current python.")
def _get_current_git_commit():
try:
result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True)
return result.stdout.strip()
except subprocess.CalledProcessError as e:
print(f"Error running git command: {e.stderr.strip()}")
return None
except FileNotFoundError:
print("Did not find command: git")
return None
def check_verl():
print("----------verl Info-----------")
try:
sys.path.insert(0, os.getcwd())
import verl
print("Version :", verl.__version__)
verl_dir = os.path.dirname(verl.__file__)
print("Directory :", verl_dir)
try:
commit_hash = _get_current_git_commit()
print("Commit Hash :", commit_hash)
except AttributeError:
print("Commit hash not found. ")
except ImportError as e:
print(f"No verl installed: {e}")
except Exception as e:
import traceback
if not isinstance(e, IOError):
print("An error occurred trying to import verl.")
print("This is very likely due to missing or incompatible library files.")
print(traceback.format_exc())
def check_os():
print("----------Platform Info----------")
print("Platform :", platform.platform())
print("system :", platform.system())
print("node :", platform.node())
print("release :", platform.release())
print("version :", platform.version())
def check_hardware():
print("----------Hardware Info----------")
print("machine :", platform.machine())
print("processor :", platform.processor())
if sys.platform.startswith("darwin"):
pipe = subprocess.Popen(("sysctl", "-a"), stdout=subprocess.PIPE)
output = pipe.communicate()[0]
for line in output.split(b"\n"):
if b"brand_string" in line or b"features" in line:
print(line.strip())
elif sys.platform.startswith("linux"):
subprocess.call(["lscpu"])
elif sys.platform.startswith("win32"):
subprocess.call(["wmic", "cpu", "get", "name"])
def check_network(args):
print("----------Network Test----------")
if args.timeout > 0:
print("Setting timeout: {}".format(args.timeout))
socket.setdefaulttimeout(args.timeout)
for region in args.region.strip().split(","):
r = region.strip().lower()
if not r:
continue
if r in REGIONAL_URLS:
URLS.update(REGIONAL_URLS[r])
else:
import warnings
warnings.warn("Region {} do not need specific test, please refer to global sites.".format(r), stacklevel=2)
for name, url in URLS.items():
test_connection(name, url, args.timeout)
def check_environment():
print("----------Environment----------")
for k, v in os.environ.items():
if k.startswith("VERL_") or k.startswith("OMP_") or k.startswith("KMP_") or k == "CC" or k == "CXX":
print('{}="{}"'.format(k, v))
def check_pip_package_versions():
packages = ["vllm", "sglang", "ray", "torch"]
for package in packages:
try:
version = importlib.metadata.version(package)
print(f"{package}\t : {version}")
except importlib.metadata.PackageNotFoundError:
print(f"{package}\t : not found.")
def check_cuda_versions():
if torch.cuda.is_available():
try:
cuda_runtime_version = torch.version.cuda
print(f"CUDA Runtime : {cuda_runtime_version}")
import subprocess
nvcc_output = subprocess.check_output(["nvcc", "--version"]).decode("utf-8")
cuda_compiler_version = next((line for line in nvcc_output.splitlines() if "release" in line), None)
if cuda_compiler_version:
print(f"CUDA Compiler : {cuda_compiler_version.strip()}")
else:
print("Could not determine CUDA compiler version.")
except FileNotFoundError as e:
print(f"CUDA compiler : Not found: {e}")
except Exception as e:
print(f"An error occurred while checking CUDA versions: {e}")
else:
print("CUDA is not available.")
def _get_cpu_memory():
"""
Get the total CPU memory capacity in GB.
"""
memory = psutil.virtual_memory()
return memory.total / (1024**3)
def _get_gpu_info():
"""
Get GPU type, GPU memory, and GPU count using nvidia-smi command.
"""
try:
result = subprocess.run(
["nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv,noheader,nounits"],
capture_output=True,
text=True,
check=True,
)
gpu_lines = result.stdout.strip().split("\n")
gpu_count = len(gpu_lines)
gpu_info = []
for line in gpu_lines:
gpu_name, gpu_memory = line.split(", ")
gpu_info.append(
{
"type": gpu_name,
"memory": float(gpu_memory) / 1024, # Convert to GB
}
)
return gpu_count, gpu_info
except subprocess.CalledProcessError:
print("Failed to execute nvidia-smi command.")
return 0, []
def _get_system_info():
"""
Get CPU memory capacity, GPU type, GPU memory, and GPU count.
"""
cpu_memory = _get_cpu_memory()
gpu_count, gpu_info = _get_gpu_info()
return {"cpu_memory": cpu_memory, "gpu_count": gpu_count, "gpu_info": gpu_info}
def check_system_info():
print("----------System Info----------")
system_info = _get_system_info()
print(f"CPU Memory\t: {system_info['cpu_memory']:.2f} GB")
print(f"GPU Count\t: {system_info['gpu_count']}")
for i, gpu in enumerate(system_info["gpu_info"]):
print(f"GPU {i + 1}\tType : {gpu['type']}")
print(f"GPU {i + 1}\tMemory : {gpu['memory']:.2f} GB")
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="Diagnose script for checking the current system.",
)
choices = ["python", "pip", "verl", "system", "os", "environment"]
for choice in choices:
parser.add_argument("--" + choice, default=1, type=int, help="Diagnose {}.".format(choice))
parser.add_argument("--network", default=0, type=int, help="Diagnose network.")
parser.add_argument("--hardware", default=0, type=int, help="Diagnose hardware.")
parser.add_argument(
"--region",
default="",
type=str,
help="Additional sites in which region(s) to test. \
Specify 'cn' for example to test mirror sites in China.",
)
parser.add_argument("--timeout", default=10, type=int, help="Connection test timeout threshold, 0 to disable.")
args = parser.parse_args()
return args
if __name__ == "__main__":
args = parse_args()
if args.python:
check_python()
if args.pip:
check_pip()
check_pip_package_versions()
if args.verl:
check_verl()
if args.os:
check_os()
if args.hardware:
check_hardware()
if args.network:
check_network(args)
if args.environment:
check_environment()
check_cuda_versions()
if args.system:
check_system_info()
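# Example invocation (a sketch; the script's file name and path are assumptions):
#   python3 scripts/diagnose.py --network 1 --region cn --timeout 5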
#!/usr/bin/env bash
set -euox pipefail
# Define config specifications: "config_name:output_file:config_arg"
CONFIG_SPECS=(
"ppo_trainer:_generated_ppo_trainer.yaml:"
"ppo_megatron_trainer:_generated_ppo_megatron_trainer.yaml:--config-name=ppo_megatron_trainer.yaml"
)
generate_config() {
local config_name="$1"
local output_file="$2"
local config_arg="$3"
local target_cfg="verl/trainer/config/${output_file}"
local tmp_header=$(mktemp)
local tmp_cfg=$(mktemp)
echo "# This reference configration yaml is automatically generated via 'scripts/generate_trainer_config.sh'" > "$tmp_header"
echo "# in which it invokes 'python3 scripts/print_cfg.py --cfg job ${config_arg}' to flatten the 'verl/trainer/config/${config_name}.yaml' config fields into a single file." >> "$tmp_header"
echo "# Do not modify this file directly." >> "$tmp_header"
echo "# The file is usually only for reference and never used." >> "$tmp_header"
echo "" >> "$tmp_header"
python3 scripts/print_cfg.py --cfg job ${config_arg} > "$tmp_cfg"
cat "$tmp_header" > "$target_cfg"
sed -n '/^actor_rollout_ref/,$p' "$tmp_cfg" >> "$target_cfg"
rm "$tmp_cfg" "$tmp_header"
echo "Generated: $target_cfg"
}
for spec in "${CONFIG_SPECS[@]}"; do
IFS=':' read -r config_name output_file config_arg <<< "$spec"
generate_config "$config_name" "$output_file" "$config_arg"
done
for spec in "${CONFIG_SPECS[@]}"; do
IFS=':' read -r config_name output_file config_arg <<< "$spec"
target_cfg="verl/trainer/config/${output_file}"
if ! git diff --exit-code -- "$target_cfg" >/dev/null; then
echo "✖ $target_cfg is out of date. Please regenerate via 'scripts/generate_trainer_config.sh' and commit the changes."
exit 1
fi
done
echo "All good"
exit 0
# Copyright 2025 Bytedance Ltd. and/or its affiliates
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script overrides a model with a custom config and random weights, mainly for creating small models for
debugging purposes.
Usage:
python scripts/init_random_model.py \
--hf_model_path <path_to_hf_model> \
--new_config_path <path_to_new_config.json> \
--output_path <path_to_output_model>
"""
import argparse
import json
import os
import warnings
from typing import Any
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PretrainedConfig
def _init_args():
parser = argparse.ArgumentParser()
parser.add_argument("--hf_model_path", type=str, required=True, help="The path for the huggingface model")
parser.add_argument("--new_config_path", type=str, required=True, help="The path for the new config file")
parser.add_argument("--output_path", type=str, required=True, help="The path for the output random model")
args = parser.parse_args()
return args
def check_output_path(output_path: str):
if os.path.exists(output_path):
warnings.warn(f"Output path '{output_path}' already exists. Will do nothing.", stacklevel=2)
exit()
else:
os.makedirs(output_path, exist_ok=True)
print(f"Output path '{output_path}' created.")
def check_configs(original_config: dict[str, Any], new_config: dict[str, Any]) -> None:
"""
Check if the original config and new config are compatible.
This is a placeholder function; actual implementation may vary based on requirements.
"""
# Example check: ensure 'model_type' is the same
if new_config.get("model_type", None) is not None and original_config.get("model_type") != new_config.get(
"model_type"
):
raise RuntimeError("Model types do not match.")
for key in new_config:
if key not in original_config:
warnings.warn(
f"Key '{key}' in new config does not exist in original config, may not take effect.", stacklevel=2
)
def init_random_model(hf_model_path, new_config_path, output_path):
config = AutoConfig.from_pretrained(hf_model_path)
tokenizer = AutoTokenizer.from_pretrained(hf_model_path)
config_dict = PretrainedConfig.get_config_dict(hf_model_path)[0]
print(config_dict)
with open(new_config_path) as f:
new_config_dict = json.load(f)
check_configs(config_dict, new_config_dict)
config_dict.update(new_config_dict)
new_config = config.from_dict(config_dict)
print(f"new_config: {new_config}")
model = AutoModelForCausalLM.from_config(new_config)
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)
new_config.save_pretrained(output_path)
print(f"Random model initialized and saved to {output_path}")
if __name__ == "__main__":
args = _init_args()
check_output_path(args.output_path)
init_random_model(
hf_model_path=args.hf_model_path, new_config_path=args.new_config_path, output_path=args.output_path
)
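# Illustrative new_config.json for shrinking a model (values are examples only, not from this repo):
#   {"num_hidden_layers": 2, "hidden_size": 128, "intermediate_size": 256, "num_attention_heads": 2}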
#!/bin/bash
USE_MEGATRON=${USE_MEGATRON:-1}
USE_SGLANG=${USE_SGLANG:-1}
export MAX_JOBS=32
echo "1. install inference frameworks and pytorch they need"
if [ $USE_SGLANG -eq 1 ]; then
pip install "sglang[all]==0.4.6.post1" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
fi
pip install --no-cache-dir "vllm==0.8.5.post1" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata
echo "2. install basic packages"
pip install "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=15.0.0" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
pytest py-spy pyext pre-commit ruff
pip install "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
echo "3. install FlashAttention and FlashInfer"
# Install flash-attn-2.7.4.post1 (cxx11abi=False)
wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Install flashinfer-0.2.2.post1+cu124 (cxx11abi=False)
# vllm-0.8.3 does not support flashinfer>=0.2.3
# see https://github.com/vllm-project/vllm/pull/15777
wget -nv https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
if [ $USE_MEGATRON -eq 1 ]; then
echo "4. install TransformerEngine and Megatron"
echo "Notice that TransformerEngine installation can take very long time, please be patient"
NVTE_FRAMEWORK=pytorch pip3 install --no-deps git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
pip3 install --no-deps git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
fi
echo "5. May need to fix opencv"
pip install opencv-python
pip install opencv-fixer && \
python -c "from opencv_fixer import AutoFix; AutoFix()"
if [ $USE_MEGATRON -eq 1 ]; then
echo "6. Install cudnn python package (avoid being overridden)"
pip install nvidia-cudnn-cu12==9.8.0.87
fi
echo "Successfully installed all packages"
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script merges verl checkpoints from the FSDP and Megatron backends into HuggingFace-format models and can test the merged results.
To merge FSDP checkpoints:
```sh
python scripts/legacy_model_merger.py merge \
--backend fsdp \
--local_dir checkpoints/verl_fsdp_gsm8k_examples/qwen2_5_0b5_fsdp_saveload/global_step_1/actor \
--target_dir /path/to/merged_hf_model
```
To merge Megatron checkpoints:
```sh
python scripts/legacy_model_merger.py merge \
--backend megatron \
--tie-word-embedding \
--local_dir checkpoints/verl_megatron_gsm8k_examples/qwen2_5_0b5_megatron_saveload/global_step_1/actor \
--target_dir /path/to/merged_hf_model
```
For more details, please refer to documentation:
https://verl.readthedocs.io/en/latest/advance/checkpoint.html#convert-fsdp-and-megatron-checkpoints-to-huggingface-format-model
"""
import argparse
import os
import re
import warnings
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional, Union
import numpy as np
import torch
from accelerate import init_empty_weights
from safetensors.torch import load_file
from torch.distributed._tensor import Placement, Shard
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoModelForTokenClassification,
AutoModelForVision2Seq,
GenerationConfig,
PretrainedConfig,
)
try:
# for torch 2.5+
from torch.distributed.tensor import DTensor
except ImportError:
from torch.distributed._tensor import DTensor
from tqdm import tqdm
from verl.utils import hf_processor, hf_tokenizer
@dataclass
class ModelMergerConfig:
operation: str # 'merge' or 'test'
backend: str
local_dir: str
hf_model_config_path: str
target_dir: Optional[str] = "tmp"
hf_upload_path: Optional[str] = None
private: bool = False
test_hf_dir: Optional[str] = None
tie_word_embedding: bool = False
is_value_model: bool = False
hf_model_path: Optional[str] = None
hf_upload: bool = field(init=False)
def __post_init__(self):
self.hf_upload = self.operation == "merge" and bool(self.hf_upload_path)
if self.operation == "test":
self.target_dir = None
self.hf_upload_path = None
self.private = False
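# Illustrative construction (paths mirror the FSDP example in the module docstring; hf_model_config_path is
# normally resolved from the checkpoint directory, so the value shown here is an assumption):
#   config = ModelMergerConfig(
#       operation="merge",
#       backend="fsdp",
#       local_dir="checkpoints/verl_fsdp_gsm8k_examples/qwen2_5_0b5_fsdp_saveload/global_step_1/actor",
#       hf_model_config_path="checkpoints/verl_fsdp_gsm8k_examples/qwen2_5_0b5_fsdp_saveload/global_step_1/actor",
#       target_dir="/path/to/merged_hf_model",
#   )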
class BaseModelMerger(ABC):
def __init__(self, config: ModelMergerConfig):
self.config = config
self.hf_model_config_path = config.hf_model_config_path
if config.hf_model_path:
print(
"Warning: --hf_model_path is deprecated and will be removed in a future version. Currently verl will save huggingface model configuration files into checkpoint directories. Therefore, there is no need to provide --hf_model_path. "
)
self.hf_model_config_path = config.hf_model_path
self.model_config = AutoConfig.from_pretrained(self.hf_model_config_path)
def get_transformers_auto_model_class(self):
if "ForTokenClassification" in self.model_config.architectures[0]:
return AutoModelForTokenClassification
elif "ForCausalLM" in self.model_config.architectures[0]:
return AutoModelForCausalLM
elif "ForConditionalGeneration" in self.model_config.architectures[0]:
return AutoModelForVision2Seq
raise NotImplementedError(f"Unknown architecture {self.model_config.architectures}")
def patch_model_generation_config(self, model):
"""
The generation_config created from the model config may differ from the pretrained model's,
which can lead to errors when generating: https://github.com/volcengine/verl/issues/1246
This function replaces the generation_config created from the model config with the one from the pretrained model.
"""
if model.can_generate():
try:
model.generation_config = GenerationConfig.from_pretrained(self.hf_model_config_path)
except OSError:
print(
f"Warning: Generation config file not found in {self.hf_model_config_path}, using a generation config created from the model config."
)
return model
def save_lora_adapter(self, state_dict: dict[str, torch.Tensor]):
"""
Save lora adapter to safetensors.
Returns:
lora_path: str, the path to the lora adapter. None if no lora adapter found.
Note:
This function changes the 'state_dict' in place.
"""
lora_params_names = [name for name in state_dict.keys() if "lora_" in name]
if len(lora_params_names) == 0:
return None
import json
from typing import OrderedDict
import peft
from safetensors.torch import save_file
lora_params = OrderedDict()
target_modules = set()
lora_key = None
for name in lora_params_names:
lora_key = name.replace(".default.weight", ".weight")
target_modules.add(lora_key.split(".")[-3])
lora_params[lora_key] = state_dict.pop(name)
lora_rank = min(lora_params[lora_key].shape[0], lora_params[lora_key].shape[1])
peft_dict = {
"r": lora_rank,
"lora_alpha": 0, # lora_alpha is not set. An error should be raised to inform the user to set it manually.
"target_modules": list(target_modules),
}
peft_config = peft.LoraConfig(**peft_dict).to_dict()
peft_config["task_type"] = peft_config["task_type"].value if peft_config["task_type"] else None
peft_config["peft_type"] = peft_config["peft_type"].value if peft_config["peft_type"] else None
peft_config["target_modules"] = list(peft_config["target_modules"])
lora_path = os.path.join(self.config.target_dir, "lora_adapter")
os.makedirs(lora_path, exist_ok=True)
with open(os.path.join(lora_path, "adapter_config.json"), "w", encoding="utf-8") as f:
json.dump(peft_config, f, ensure_ascii=False, indent=4)
save_file(lora_params, os.path.join(lora_path, "adapter_model.safetensors"))
for name in list(state_dict.keys()):
key = (
name.replace("base_model.model.", "")
.replace(".base_layer.weight", ".weight")
.replace(".base_layer.bias", ".bias")
)
state_dict[key] = state_dict.pop(name)
return lora_path
def save_hf_model_and_tokenizer(self, state_dict: dict[str, torch.Tensor]):
auto_model_class = self.get_transformers_auto_model_class()
with init_empty_weights():
model = auto_model_class.from_config(self.model_config, torch_dtype=torch.bfloat16)
model.to_empty(device="cpu")
model = self.patch_model_generation_config(model)
lora_path = self.save_lora_adapter(state_dict)
if lora_path:
print(f"Saving lora adapter to {lora_path}")
print(f"Saving model to {self.config.target_dir}")
model.save_pretrained(self.config.target_dir, state_dict=state_dict)
del state_dict
del model
processor = hf_processor(self.hf_model_config_path)
tokenizer = hf_tokenizer(self.hf_model_config_path)
if processor is not None:
print(f"Saving processor to {self.config.target_dir}")
processor.save_pretrained(self.config.target_dir)
if tokenizer is not None:
print(f"Saving tokenizer to {self.config.target_dir}")
tokenizer.save_pretrained(self.config.target_dir)
def upload_to_huggingface(self):
from huggingface_hub import HfApi
api = HfApi()
api.create_repo(repo_id=self.config.hf_upload_path, private=self.config.private, exist_ok=True)
api.upload_folder(folder_path=self.config.target_dir, repo_id=self.config.hf_upload_path, repo_type="model")
@abstractmethod
def merge_and_save(self):
raise NotImplementedError("Subclasses should implement this method")
class FSDPModelMerger(BaseModelMerger):
def _get_world_size(self) -> int:
"""Extracts the FSDP world_size from checkpoint filenames (e.g., 'model_world_size_8_rank_0.pt')."""
for filename in os.listdir(self.config.local_dir):
match = re.match(r"model_world_size_(\d+)_rank_0\.pt", filename)
if match:
return int(match.group(1))
raise FileNotFoundError(
f"Could not determine world size. No file matching 'model_world_size_(\d+)_rank_0.pt' found in {self.config.local_dir}"
)
def _load_rank_zero_state_dict(self, world_size: int) -> dict:
return torch.load(
Path(self.config.local_dir) / f"model_world_size_{world_size}_rank_0.pt",
map_location="cpu",
weights_only=False,
)
def _extract_device_mesh_info(self, state_dict: dict, world_size: int) -> tuple[np.ndarray, tuple[str, ...]]:
"""
Retrieves sharding information (device_mesh, mesh_dim_names) from a DTensor in the state_dict.
If no DTensor is found, infers a simple FSDP mesh based on world_size.
"""
pivot_key = sorted(list(state_dict.keys()))[0]
weight = state_dict[pivot_key]
if isinstance(weight, DTensor):
# get sharding info
device_mesh = weight.device_mesh
mesh = device_mesh.mesh
mesh_dim_names = device_mesh.mesh_dim_names
else:
# for non-DTensor
mesh = np.array([world_size], dtype=np.int64)
mesh_dim_names = ("fsdp",)
return mesh, mesh_dim_names
def _calculate_shard_configuration(
self, mesh: np.ndarray, mesh_dim_names: tuple[str, ...]
) -> tuple[int, tuple[int, ...]]:
"""Calculates the total number of shards and the shape of the device mesh."""
assert mesh_dim_names in (("fsdp",), ("ddp", "fsdp")), f"Unsupported mesh_dim_names {mesh_dim_names}"
if "tp" in mesh_dim_names:
# TODO: "tp" is not supported yet due to the above assert
total_shards = mesh.shape[-1] * mesh.shape[-2]
mesh_shape = (mesh.shape[-2], mesh.shape[-1])
else:
total_shards = mesh.shape[-1]
mesh_shape = (mesh.shape[-1],)
return total_shards, mesh_shape
def _merge_by_placement(self, tensors: list[torch.Tensor], placement: Placement) -> torch.Tensor:
"""Merges a list of tensors based on their DTensor placement"""
if placement.is_replicate():
return tensors[0]
elif placement.is_partial():
raise NotImplementedError("Partial placement is not supported yet")
elif placement.is_shard():
return torch.cat(tensors, dim=placement.dim).contiguous()
raise NotImplementedError(f"Unsupported placement: {placement}")
def _load_and_merge_state_dicts(
self, world_size: int, total_shards: int, mesh_shape: tuple[int, ...], mesh_dim_names: tuple[str, ...]
) -> dict[str, torch.Tensor]:
model_state_dict_lst = [None] * total_shards
def process_one_shard(rank: int, model_state_dict_lst: list):
model_path = Path(self.config.local_dir) / f"model_world_size_{world_size}_rank_{rank}.pt"
state_dict = torch.load(model_path, map_location="cpu", weights_only=False)
model_state_dict_lst[rank] = state_dict
return state_dict
with ThreadPoolExecutor(max_workers=min(32, os.cpu_count())) as executor:
futures = [executor.submit(process_one_shard, rank, model_state_dict_lst) for rank in range(total_shards)]
for future in tqdm(futures, desc=f"Loading {total_shards} FSDP shards", total=total_shards):
future.result()
# Merge state dicts from all shards
state_dict = {}
param_placements: dict[str, list] = {}
for key in set(model_state_dict_lst[0].keys()):
state_dict[key] = []
for model_state_shard in model_state_dict_lst:
# add tensor shard in order of rank to state_dict[key]
tensor = model_state_shard.pop(key)
if isinstance(tensor, DTensor):
state_dict[key].append(tensor._local_tensor.bfloat16())
placements = tuple(tensor.placements)
# replicated placement at dp dimension can be discarded
if mesh_dim_names[0] in ("dp", "ddp"):
placements = placements[1:]
if key not in param_placements:
param_placements[key] = placements
else:
assert param_placements[key] == placements
else:
state_dict[key].append(tensor.bfloat16())
del model_state_dict_lst
# Merge tensors
for key in sorted(state_dict):
if not isinstance(state_dict[key], list):
print(f"No need to merge key {key}")
continue
if key in param_placements:
# merge shards
placements: tuple[Shard] = param_placements[key]
if len(mesh_shape) == 1:
# 1-D list, FSDP without TP
assert len(placements) == 1
shards = state_dict[key]
state_dict[key] = self._merge_by_placement(shards, placements[0])
else:
# 2-D list, FSDP + TP
raise NotImplementedError("FSDP + TP is not supported yet")
else:
state_dict[key] = torch.cat(state_dict[key], dim=0)
return state_dict
def merge_and_save(self):
world_size = self._get_world_size()
rank_zero_state_dict = self._load_rank_zero_state_dict(world_size)
mesh, mesh_dim_names = self._extract_device_mesh_info(rank_zero_state_dict, world_size)
print(f"Got device mesh {mesh}, mesh_dim_names {mesh_dim_names}")
total_shards, mesh_shape = self._calculate_shard_configuration(mesh, mesh_dim_names)
print(f"Processing model shards with {total_shards} {mesh_shape} in total")
merged_state_dict = self._load_and_merge_state_dicts(world_size, total_shards, mesh_shape, mesh_dim_names)
if self.config.operation == "test":
if not self.config.test_hf_dir:
raise ValueError("test_hf_dir must be provided for test operation")
self._test_state_dict(merged_state_dict)
elif self.config.operation == "merge":
self.save_hf_model_and_tokenizer(merged_state_dict)
if self.config.hf_upload:
self.upload_to_huggingface()
else:
raise ValueError(f"Unknown operation: {self.config.operation}")
def _test_state_dict(self, state_dict: dict[str, torch.Tensor]):
auto_model_class = self.get_transformers_auto_model_class()
hf_model = auto_model_class.from_pretrained(self.config.test_hf_dir, torch_dtype=torch.bfloat16)
hf_state_dict = hf_model.state_dict()
del hf_model
hf_model_keys = set(hf_state_dict.keys())
collected_keys = set(state_dict.keys())
missing_keys = hf_model_keys - collected_keys
assert len(missing_keys) == 0, f"Missing keys in collected state dict: {list(sorted(missing_keys))}"
extra_keys = collected_keys - hf_model_keys
assert len(extra_keys) == 0, f"Extra keys in collected state dict: {list(sorted(extra_keys))}"
for key in hf_model_keys:
hf_shape = hf_state_dict[key].shape
collected_shape = state_dict[key].shape
assert hf_shape == collected_shape, (
f"Shape mismatch for key '{key}': original {hf_shape} vs collected {collected_shape}"
)
hf_dtype = hf_state_dict[key].dtype
collected_dtype = state_dict[key].dtype
assert hf_dtype == collected_dtype, (
f"Dtype mismatch for key '{key}': original {hf_dtype} vs collected {collected_dtype}"
)
torch.testing.assert_close(hf_state_dict[key], state_dict[key], atol=1e-6, rtol=1e-6)
print("FSDP checks passed: The merged state_dict matches the hf model saved by FSDPCheckpointManager.")
class MegatronModelMerger(BaseModelMerger):
def __init__(self, config: ModelMergerConfig):
from verl.utils.megatron_utils import get_hf_config_and_tokenizer_checkpoint_path
config.hf_model_config_path = get_hf_config_and_tokenizer_checkpoint_path(config.local_dir)
super().__init__(config)
self.params_mapping = {
# megatron core gpt model name, huggingface model name
# NOTE: matching is substring-based, so when two keys share a prefix (e.g. "self_attention.linear_qkv.layer_norm_weight" and "self_attention.linear_qkv"), the longer, more specific key must come first so it is matched before the shorter one.
"embedding.word_embeddings": "model.embed_tokens",
# attn
"self_attention.linear_qkv.layer_norm_weight": "input_layernorm.weight",
"self_attention.linear_qkv.layer_norm_bias": "input_layernorm.bias",
"self_attention.linear_qkv": "self_attn.qkv_proj",
"self_attention.q_layernorm": "self_attn.q_norm",
"self_attention.k_layernorm": "self_attn.k_norm",
"self_attention.linear_proj": "self_attn.o_proj",
# mla
"self_attention.linear_q_proj": "self_attn.q_proj",
"self_attention.linear_q_down_proj": "self_attn.q_a_proj",
"self_attention.linear_q_up_proj.layer_norm_weight": "self_attn.q_a_layernorm.weight",
"self_attention.linear_q_up_proj": "self_attn.q_b_proj",
"self_attention.linear_kv_down_proj": "self_attn.kv_a_proj_with_mqa",
"self_attention.linear_kv_up_proj.layer_norm_weight": "self_attn.kv_a_layernorm.weight",
"self_attention.linear_kv_up_proj": "self_attn.kv_b_proj",
# mlp
"pre_mlp_layernorm": "post_attention_layernorm",
"mlp.linear_fc1.layer_norm_weight": "post_attention_layernorm.weight",
"mlp.linear_fc1.layer_norm_bias": "post_attention_layernorm.bias",
"mlp.linear_fc1": "mlp.gate_up_proj",
"mlp.linear_fc2": "mlp.down_proj",
# moe
"mlp.router.expert_bias": "mlp.gate.e_score_correction_bias",
"mlp.router": "mlp.gate",
"mlp.shared_experts.linear_fc1": "mlp.shared_experts.gate_up_proj",
"mlp.shared_experts.linear_fc2": "mlp.shared_experts.down_proj",
"linear_fc1": "gate_up_proj",
"linear_fc2": "down_proj",
# output
"final_layernorm": "norm",
"output_layer": "lm_head",
}
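# Illustrative example (derived from _replace_name below): with this table, a Megatron key such as
#   "decoder.layers.0.self_attention.linear_proj.weight"
# becomes the Hugging Face key
#   "model.layers.0.self_attn.o_proj.weight"
# ("decoder" is rewritten to "model" and the first matching table entry is applied,
# which is why longer keys are listed before their shorter prefixes).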
def _get_tp_pp_rank_from_sharded_dir(self, sharded_dir: str) -> tuple[int, int]:
tp_rank = pp_rank = None
rank_list = sharded_dir.split("_")[2:]
if re.match(r"mp_rank_(\d\d)_(\d\d\d)", sharded_dir):
tp_rank = int(rank_list[0])
pp_rank = int(rank_list[1])
elif re.match(r"mp_rank_(\d\d)", sharded_dir):
tp_rank = int(rank_list[0])
pp_rank = 0
assert tp_rank is not None and pp_rank is not None, f"Invalid sharded dir {sharded_dir}"
return tp_rank, pp_rank
def _check_megatron_checkpoint_path(self, model_path: str) -> tuple[list[str], int, int]:
"""
Validates the Megatron checkpoint structure (presence of 'model.pt' in sharded directories).
Determines TP and PP sizes from directory names.
"""
tp_size = 0
pp_size = 0
sharded_dirs = sorted(os.listdir(model_path))
for sharded_dir in sharded_dirs:
assert "model.pt" in os.listdir(Path(model_path) / sharded_dir), f"model.pt not found in {sharded_dir}"
tp_rank, pp_rank = self._get_tp_pp_rank_from_sharded_dir(sharded_dir)
tp_size = max(tp_size, tp_rank + 1)
pp_size = max(pp_size, pp_rank + 1)
return sharded_dirs, tp_size, pp_size
def _merge_across_tp(
self,
key: str,
tp_data: list[torch.Tensor],
config: PretrainedConfig,
tp_size: int,
is_value_model: bool = False,
) -> Union[torch.Tensor, list[torch.Tensor]]:
if "linear_fc1.weight" in key:
# fused gate/up projection: split each TP shard into gate and up parts, then concat each separately
gate_lst = []
up_lst = []
for infer_param in tp_data:
gate, up = infer_param.chunk(2)
gate_lst.append(gate)
up_lst.append(up)
gate = torch.cat(gate_lst, dim=0)
up = torch.cat(up_lst, dim=0)
return [gate, up]
elif "self_attention.linear_qkv." in key and "layer_norm" not in key:
# if the tensor is qkv, for each param on tp, split into q, k, v
# concat q, k, v separately.
q_lst = []
k_lst = []
v_lst = []
assert config.num_attention_heads % config.num_key_value_heads == 0
num_q_per_kv = config.num_attention_heads // config.num_key_value_heads
assert tp_data[0].shape[0] % (num_q_per_kv + 2) == 0
kv_size_per_tp = tp_data[0].shape[0] // (num_q_per_kv + 2)
split_size = [kv_size_per_tp * num_q_per_kv, kv_size_per_tp, kv_size_per_tp]
for infer_param in tp_data:
num_query_groups_per_partition = config.num_key_value_heads // tp_size
for chunk in infer_param.chunk(num_query_groups_per_partition):
split_size = [
kv_size_per_tp * num_q_per_kv // num_query_groups_per_partition,
kv_size_per_tp // num_query_groups_per_partition,
kv_size_per_tp // num_query_groups_per_partition,
]
q, k, v = chunk.split(split_size)
q_lst.append(q)
k_lst.append(k)
v_lst.append(v)
q = torch.cat(q_lst, dim=0)
k = torch.cat(k_lst, dim=0)
v = torch.cat(v_lst, dim=0)
return [q, k, v]
elif "layer_norm" in key or "layernorm" in key or "router" in key or ("output_layer" in key and is_value_model):
return tp_data[0]
else:
dim = 0
if "linear_fc2.weight" in key or "self_attention.linear_proj" in key:
dim = 1
return torch.cat(tp_data, dim=dim)
def _load_state_dicts(
self, model_ckpt_path: str, sharded_dirs: list[str], tp_size: int, pp_size: int
) -> list[list[dict]]:
model_state_dict_lst = [[None for _ in range(tp_size)] for _ in range(pp_size)]
def _process_one_megatron_shard(sharded_dir: str):
model_file_path = Path(model_ckpt_path) / sharded_dir / "model.pt"
state_dict = torch.load(model_file_path, map_location="cpu", weights_only=False)
tp_rank, pp_rank = self._get_tp_pp_rank_from_sharded_dir(sharded_dir)
model_state_dict_lst[pp_rank][tp_rank] = state_dict
with ThreadPoolExecutor(max_workers=min(32, os.cpu_count())) as executor:
futures = [executor.submit(_process_one_megatron_shard, sharded_dir) for sharded_dir in sharded_dirs]
for future in tqdm(futures, desc=f"Loading {len(sharded_dirs)} Megatron shards", total=len(sharded_dirs)):
future.result()
return model_state_dict_lst
def _check_megatron_state_key(self, key: str) -> None:
"""
Checks that the key is a valid Megatron state key.
The model merger currently only supports keys that start with "decoder"/"embedding"/"output_layer" in TransformerLayer.
Keys must not start with "model.".
"""
if key.startswith("model."):
raise ValueError(
f"Invalid key {key} in Megatron state_dict. Expected keys to start with 'decoder/embedding/output_layer' in TransformerLayer."
)
skip_checking_keys = ["embedding.word_embeddings", "output_layer"]
for skip_key in skip_checking_keys:
if skip_key in key:
print(f"skip checking key {key}")
return
# Exclude extra state keys
if not key.startswith("decoder"):
raise ValueError(
f"Invalid key {key} in Megatron state_dict. Expected keys to start with 'decoder' in TransformerLayer."
)
def _merge_state_dicts(
self, model_state_dict_lst: list[list[dict]], tp_size: int, pp_size: int
) -> dict[str, torch.Tensor]:
state_dict = {}
vpp_size = len(model_state_dict_lst[0][0])
layers_cum = 0
for vpp_rank in range(vpp_size):
for pp_rank in range(pp_size):
layers_handled = 0
keys = model_state_dict_lst[pp_rank][0][vpp_rank].keys()
for key in keys:
if "extra_state" in key:
continue
if self.config.tie_word_embedding and ("output_layer" in key):
print("skip lm_head and reward_head loading because of tie_word_embeddings")
continue
self._check_megatron_state_key(key)
hf_name = self._replace_name(key, self.params_mapping)
assert hf_name is not None, f"Failed to convert layer name [{key}] from megatron to huggingface."
if "model.layers." in hf_name:
local_layer_no = int(hf_name.split(".")[2])
layers_handled = max(local_layer_no, layers_handled)
global_layer_no = local_layer_no + layers_cum
new_key_list = hf_name.split(".")
new_key_list[2] = str(global_layer_no)
hf_name = ".".join(new_key_list)
else:
warnings.warn(f"hf_name {hf_name} will not be fixed with layer number", stacklevel=2)
tp_data = [model_state_dict_lst[pp_rank][tp_rank][vpp_rank][key] for tp_rank in range(tp_size)]
merged = self._merge_across_tp(key, tp_data, self.model_config, tp_size, self.config.is_value_model)
if not isinstance(merged, list):
state_dict[hf_name] = merged
elif len(merged) == 3:
# split qkv
for n, d in zip(["q", "k", "v"], merged):
state_dict[hf_name.replace("qkv", n)] = d
elif len(merged) == 2:
# split gate up
state_dict[hf_name.replace("gate_up", "gate")] = merged[0]
state_dict[hf_name.replace("gate_up", "up")] = merged[1]
print(
f"converted {key} to {hf_name} with shape {merged.shape if isinstance(merged, torch.Tensor) else [t.shape for t in merged]}"
)
layers_cum += layers_handled + 1 # zero based
return state_dict
def merge_and_save(self):
from verl.utils.megatron_utils import get_model_checkpoint_path
model_ckpt_path = get_model_checkpoint_path(self.config.local_dir)
sharded_dirs, tp_size, pp_size = self._check_megatron_checkpoint_path(model_ckpt_path)
print(f"sharded_dirs: {sharded_dirs}, tp_size: {tp_size}, pp_size: {pp_size}, mp_size: {len(sharded_dirs)}")
model_state_dict_lst = self._load_state_dicts(model_ckpt_path, sharded_dirs, tp_size, pp_size)
merged_state_dict = self._merge_state_dicts(model_state_dict_lst, tp_size, pp_size)
del model_state_dict_lst
if self.config.operation == "test":
if not self.config.test_hf_dir:
raise ValueError("test_hf_dir must be provided for test operation")
self._test_state_dict(merged_state_dict)
elif self.config.operation == "merge":
self.save_hf_model_and_tokenizer(merged_state_dict)
if self.config.hf_upload:
self.upload_to_huggingface()
else:
raise ValueError(f"Unknown operation: {self.config.operation}")
def _test_state_dict(self, state_dict: dict[str, torch.Tensor]):
"""
Compares the merged Megatron state_dict against a reference safetensors model.
Applies necessary name mappings from Megatron to Hugging Face conventions using _replace_name.
"""
ref_state_dict = load_file(Path(self.config.test_hf_dir) / "model.safetensors")
for name, loaded_weight in state_dict.items():
# name = self._replace_name(original_name, self.params_mapping)
if not name or (name.endswith(".bias") and name not in ref_state_dict):
continue
if "rotary_emb.inv_freq" in name:
continue
if self.config.tie_word_embedding and "lm_head.weight" in name:
continue
if name not in ref_state_dict:
raise RuntimeError(f"key: {name} not exist in state_dict")
param = ref_state_dict[name]
assert loaded_weight.dtype == param.dtype
torch.testing.assert_close(loaded_weight, param, atol=1e-2, rtol=5e-2)
def _replace_name(self, megatron_name: str, name_mapping: dict[str, str]) -> str:
for m_name, v_name in name_mapping.items():
if m_name not in megatron_name:
continue
megatron_name = megatron_name.replace("decoder", "model")
param_name = megatron_name.replace(m_name, v_name)
return param_name
return None # Return None if no mapping found
def main():
parser = argparse.ArgumentParser(description="verl model merger")
subparsers = parser.add_subparsers(dest="operation", required=True, help="Specify 'merge' or 'test' operation.")
base_op_parser = argparse.ArgumentParser(add_help=False)
base_op_parser.add_argument(
"--backend", type=str, required=True, choices=["fsdp", "megatron"], help="The backend of the model"
)
base_op_parser.add_argument("--local_dir", type=str, required=True, help="Path to the saved model checkpoints")
base_op_parser.add_argument(
"--hf_model_path",
type=str,
default=None,
help="(Deprecated) Path to the original Hugging Face model for config.",
)
base_op_parser.add_argument(
"--tie-word-embedding",
action="store_true",
help="Whether to tie word embedding weights (currently only Megatron supported)",
)
base_op_parser.add_argument(
"--is-value-model",
action="store_true",
help="Whether the model is a value model (currently only Megatron supported)",
)
merge_parser = subparsers.add_parser("merge", parents=[base_op_parser], help="Merge model checkpoints and save.")
merge_parser.add_argument(
"--target_dir", default="tmp", type=str, help="Directory to save the merged huggingface model"
)
merge_parser.add_argument(
"--hf_upload_path", default=None, type=str, help="Hugging Face repository ID to upload the model"
)
merge_parser.add_argument(
"--private", action="store_true", help="Whether to upload the model to a private Hugging Face repository"
)
test_parser = subparsers.add_parser(
"test", parents=[base_op_parser], help="Test merged model against a reference Hugging Face model"
)
test_parser.add_argument(
"--test_hf_dir", type=str, required=True, help="Path to the reference Hugging Face model directory for testing"
)
args = parser.parse_args()
common_config_args = {
"operation": args.operation,
"backend": args.backend,
"tie_word_embedding": args.tie_word_embedding,
"is_value_model": args.is_value_model,
"local_dir": args.local_dir,
"hf_model_path": args.hf_model_path,
"hf_model_config_path": args.local_dir,
}
if args.operation == "merge":
config = ModelMergerConfig(
**common_config_args,
target_dir=args.target_dir,
hf_upload_path=args.hf_upload_path,
private=args.private,
test_hf_dir=None,
)
os.makedirs(config.target_dir, exist_ok=True)
elif args.operation == "test":
config = ModelMergerConfig(
**common_config_args,
test_hf_dir=args.test_hf_dir,
# the following args are not used by test operation
target_dir=None,
hf_upload_path=None,
private=False,
)
else:
raise NotImplementedError(f"Unknown operation: {args.operation}")
if config.backend == "fsdp":
merger = FSDPModelMerger(config)
elif config.backend == "megatron":
merger = MegatronModelMerger(config)
else:
raise NotImplementedError(f"Unknown backend: {config.backend}")
merger.merge_and_save()
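# Example invocations (script and checkpoint paths are illustrative placeholders):
#   python <this_script>.py merge --backend fsdp --local_dir checkpoints/global_step_100/actor --target_dir /tmp/merged_hf_model
#   python <this_script>.py test --backend megatron --local_dir checkpoints/global_step_100/actor --test_hf_dir /path/to/reference_hf_model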
if __name__ == "__main__":
main()
# Copyright 2025 Bytedance Ltd. and/or its affiliates
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
try:
import hydra
except ImportError as e:
raise ImportError("Please install hydra-core via 'pip install hydra-core' and retry.") from e
@hydra.main(config_path="../verl/trainer/config", config_name="ppo_trainer", version_base=None)
def main(config):
"""Main entry point for PPO training with Hydra configuration management.
Args:
config: Hydra configuration object containing the training parameters.
"""
print(config)
from verl.utils.config import omega_conf_to_dataclass
profiler_config = omega_conf_to_dataclass(config.critic.profiler)
print(profiler_config)
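# Hydra resolves the "ppo_trainer" config and applies any command-line overrides, e.g.
# (script path and override field are illustrative placeholders):
#   python <this_script>.py critic.profiler.<field>=<value>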
if __name__ == "__main__":
main()
# Copyright 2025 Bytedance Ltd. and/or its affiliates
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import re
import traceback
from pathlib import Path
from typing import Annotated, Optional
import aiofiles
try:
import ujson as json
except ImportError:
import json
import typer
from rich.highlighter import ReprHighlighter
from rich.markdown import Markdown
from rich.table import Table
from rich.text import Text
from textual import on
from textual.app import App, ComposeResult
from textual.containers import Horizontal, Vertical, VerticalScroll
from textual.widgets import Input, ProgressBar, Select, SelectionList, Static
INDEX_KEY = "__IDX"
FILE_SUFFIX = ".jsonl"
def check_textual_version():
# check if textual version is equal to 0.52.1
import textual
from packaging.version import Version
if Version(textual.__version__) != Version("0.52.1"):
raise ImportError(f"Textual version {textual.__version__} is not supported, please pip install textual==0.52.1")
check_textual_version()
async def load_path(p: Path, data: dict, mask_strs: str, idx: int, pbar):
samples = []
async with aiofiles.open(p, encoding="utf-8") as f:
async for line in f:
d = json.loads(line)
for k in d:
if isinstance(d[k], str):
if mask_strs:
d[k] = re.sub(rf"{mask_strs}", "*", d[k])
else:
d[k] = json.dumps(d[k], ensure_ascii=False, indent=4)
d[INDEX_KEY] = len(samples)
samples.append(d)
data[idx] = {"samples": samples}
print(f"path {p} loaded")
pbar.advance(1)
async def load_dir(path: Path, data: dict[int, dict], pbar, mask_strs: str = ""):
paths = list(path.glob(f"*{FILE_SUFFIX}"))
paths = sorted(paths, key=lambda x: int(x.stem))
tasks = [load_path(p, data, mask_strs, i, pbar) for i, p in enumerate(paths)]
await asyncio.gather(*tasks)
class Highlighter(ReprHighlighter):
highlights = ReprHighlighter.highlights + [
r"(?P<tag_name>[][\<\>{}()\|()【】\[\]=`])",
r"\<\|(?P<tag_name>[\w\W]*?)\|\>",
]
def center_word_with_equals_exactly(word: str, total_length: int, char: str = "=") -> str:
if len(word) > total_length:
return word
padding = total_length - len(word)
left_pad = (padding) // 2
right_pad = (padding + 1) // 2
return char * left_pad + " " + word + " " + char * right_pad
def highlight_keyword(content: str, keyword: Optional[str]):
if not keyword:
return Text(content)
text = Text()
parts = content.split(keyword)
for i, part in enumerate(parts):
text.append(part, style=None)
if i < len(parts) - 1:
# text.append(keyword, style=Style(color="#d154d1", bgcolor="yellow", bold=True))
text.append(keyword, style="on #8f51b5")
return text
help_doc = """
⌨️ keybinds:
- `f/esc`: find/cancel
- `tab/←/→`: change focus
- `j/k`: page down/up
- `g/G`: scroll home/end
- `n/N`: next sample/step
- `p/P`: previous sample/step
- `s`: switch display mode
- plain text
- rich table
"""
class JsonLineViewer(App):
BINDINGS = [
("left", "focus_previous", "Focus Previous"),
("right", "focus_next", "Focus Next"),
("s", "swith_render", "switch render"),
# control
("n", "next_sample", "Next Sample"),
("N", "next_step", "Next Step"),
("p", "previous_sample", "Previous Sample"),
("P", "previous_step", "Previous Step"),
# search
("f", "toggle_search", "find"),
("enter", "next_search", "find next"),
("escape", "cancel_search", "cancel find"),
# scroll
("j", "page_down", "page down"),
("k", "page_up", "page up"),
("g", "page_home", "page home"),
("G", "page_end", "page end"),
]
CSS = """
Select:focus > SelectCurrent {
border: tall #8f51b5;
}
Select.-expanded > SelectCurrent {
border: tall #8f51b5;
}
#select-container {
width: 15%;
height: 100%;
align: center top;
}
#search-container {
height: 10%;
align: center top;
}
#search-box {
width: 50%;
}
#reqid-box {
width: 50%;
}
"""
def __init__(self, step_num: int, data: dict[int, dict], pbar):
super().__init__()
self.step_num = step_num
self.data = data
self.render_table = False
self.selected_step_index = 0
self.selected_sample_index = 0
self.pbar = pbar
self.matches = []
self.current_match_index = 0
self.highlighter = Highlighter()
first_samples = data[list(data.keys())[0]]["samples"]
# Prepare the initial field filter list (all keys from the first sample)
self.filter_fields = [(f, f, True) for f in first_samples[0].keys()]
# Internal set used for fast membership checks when we add new fields on the fly.
# We keep it here so that when new columns appear in later steps (e.g. `request_id`),
# they can be added to the UI automatically without restarting the viewer.
self._field_set: set[str] = set(first_samples[0].keys())
self.sample_num = len(first_samples)
def compose(self) -> ComposeResult:
with Horizontal(id="search-container"):
yield Input(placeholder="find something...", id="search-box")
yield Input(placeholder="request id...", id="reqid-box")
with Vertical(id="search-container2"):
yield self.pbar
yield Static("", id="search-status")
with Horizontal():
with Vertical(id="select-container"):
yield Static("\n")
yield Static(
renderable=Markdown(
help_doc,
),
markup=False,
)
yield Static("\n")
yield Select(
id="step-select",
value=0,
prompt="select step",
options=[("step: 1", 0)],
allow_blank=False,
)
yield Select(
id="sample-select",
value=0,
prompt="select sample",
options=[("sample: 1", 0)],
allow_blank=False,
)
yield Select(
id="sample-sort",
value=0,
prompt="排序",
options=[
("sort", 0),
("score asc", 1),
("score desc", 2),
],
allow_blank=False,
)
yield SelectionList[int](("Select ALL", 1, True), id="fields-select-all")
with VerticalScroll(id="scroll-view2"):
yield SelectionList[str](*self.filter_fields, id="fields-select")
with VerticalScroll(id="scroll-view"):
yield Static(id="content", markup=False)
async def on_mount(self) -> None:
self.step_select = self.query_one("#step-select", Select)
self.sample_select = self.query_one("#sample-select", Select)
self.sample_sort = self.query_one("#sample-sort", Select)
self.content_display = self.query_one("#content", Static)
self.search_box = self.query_one("#search-box", Input)
self.reqid_box = self.query_one("#reqid-box", Input)
self.scroll_view = self.query_one("#scroll-view", VerticalScroll)
self.search_status = self.query_one("#search-status", Static)
self.fields_select = self.query_one("#fields-select", SelectionList)
self.fields_select.border_title = "field filter"
if self.data:
self.step_select.set_options([(f"step: {i + 1}", i) for i in range(self.step_num)])
self.sample_select.set_options([(f"sample: {i + 1}", i) for i in range(self.sample_num)])
self.step_select.focus()
await self.update_content()
def update_result_options(self, offset: int = 0, sort_desc: Optional[bool] = None):
options = []
if isinstance(self.selected_step_index, int) and self.selected_step_index < len(self.data):
if self.sample_num is None or sort_desc is not None:
samples = self.data[self.selected_step_index].get("samples", [])
if not samples:
self.selected_sample_index = offset
return
if sort_desc is not None:
samples = sorted(
samples,
key=lambda x: x.get("score", x.get("score_1", 0)),
reverse=sort_desc,
)
options = [(f"sample: {r[INDEX_KEY] + 1}", r[INDEX_KEY]) for r in samples]
self.sample_select.set_options(options)
self.sample_num = len(samples)
if sort_desc is not None and options:
self.selected_sample_index = options[0][1]
else:
self.selected_sample_index = offset
async def update_content(self, search_keyword: Optional[str] = None):
content = ""
try:
samples = self.data[self.selected_step_index].get("samples", [])
content_dict_full = samples[self.selected_sample_index]
# Dynamically track any NEW keys that appear and add them to the field filter.
self._update_fields_select(content_dict_full.keys())
# Apply field selection filter (only show selected fields)
content_dict = {k: v for k, v in content_dict_full.items() if k in self.fields_select.selected}
if self.render_table:
content = Table("key", "value", show_lines=True)
for k in content_dict:
v = content_dict[k]
v = f"{v}"
content.add_row(
k,
self.highlighter(highlight_keyword(v, search_keyword)),
)
else:
text = Text()
for k in content_dict:
v = content_dict[k]
s = center_word_with_equals_exactly(k, 64) + f"\n{v}\n"
text.append(highlight_keyword(s, search_keyword))
content = self.highlighter(text)
except KeyError:
content = f"Loading data asynchronously, progress: {len(self.data)}/{self.step_num} step"
except Exception:
content = self.highlighter(traceback.format_exc())
self.content_display.update(content)
# ---------------------------------------------------------------------
# Request-ID jump logic
# ---------------------------------------------------------------------
@on(Input.Submitted, "#reqid-box")
async def on_reqid_submitted(self, event: Input.Submitted) -> None:
"""Jump to the sample that has a matching `request_id`."""
req_id_raw = event.value.strip()
# Remove hyphens so search is tolerant to different id formats
req_id = req_id_raw.replace("-", "")
if not req_id:
return
found = False
for step_idx, step_data in self.data.items():
for sample in step_data.get("samples", []):
sample_id = str(sample.get("request_id", ""))
if sample_id.replace("-", "") == req_id:
# Update selected indices
self.selected_step_index = step_idx
self.step_select.value = step_idx
# Ensure sample list is updated and select sample
self.update_result_options(offset=sample[INDEX_KEY])
self.selected_sample_index = sample[INDEX_KEY]
self.sample_select.value = sample[INDEX_KEY]
await self._clear_search()
await self.update_content()
found = True
break
if found:
break
if not found:
self.search_status.update(Text(f"request_id '{req_id_raw}' not found", style="bold red"))
else:
# Keep the typed id in the input box so users see what was searched.
pass
# ---------------------------------------------------------------------
# Helper: add new fields to SelectionList on-the-fly
# ---------------------------------------------------------------------
def _update_fields_select(self, keys):
"""Add any unseen *keys* to the field-selection widget so they can be toggled.
The viewer is often launched with only the first step loaded. Later steps may
introduce new columns (e.g. `request_id`). This helper ensures those fields
become visible without requiring a restart.
"""
# Ensure we have the widget (only after on_mount)
if not hasattr(self, "fields_select"):
return
for k in keys:
if k not in self._field_set:
self._field_set.add(k)
try:
# By default, new fields are selected so they appear immediately.
self.fields_select.add_option(k, k, selected=True)
except Exception:
# Fallback for older textual versions where signature is different.
self.fields_select.add_option((k, k, True))
@on(Select.Changed, "#step-select")
async def step_changed(self, event):
self.selected_step_index = event.value
self.update_result_options()
await self.update_content()
@on(Select.Changed, "#sample-select")
async def sample_changed(self, event):
self.selected_sample_index = event.value
await self._clear_search()
await self.update_content()
@on(Select.Changed, "#sample-sort")
async def sort_changed(self, event):
v = event.value
self.update_result_options(sort_desc=None if v == 0 else False if v == 1 else True)
await self.update_content()
@on(SelectionList.SelectedChanged, "#fields-select")
async def fields_changed(self, event):
await self.update_content()
@on(SelectionList.SelectedChanged, "#fields-select-all")
async def fields_all_changed(self, event):
s = self.query_one("#fields-select-all", SelectionList)
if s.selected:
self.fields_select.select_all()
else:
self.fields_select.deselect_all()
def action_focus_previous(self):
self.screen.focus_previous()
def action_focus_next(self):
self.screen.focus_next()
async def action_next_step(self) -> None:
self.selected_step_index += 1
if self.selected_step_index >= self.step_num:
self.selected_step_index = 0
self.step_select.value = self.selected_step_index
self.update_result_options()
await self.update_content()
async def action_next_sample(self) -> None:
self.selected_sample_index += 1
if not self.sample_num or self.selected_sample_index >= self.sample_num:
self.selected_sample_index = 0
self.sample_select.value = self.selected_sample_index
await self._clear_search()
await self.update_content()
async def action_previous_step(self) -> None:
self.selected_step_index -= 1
if self.selected_step_index < 0:
self.selected_step_index = self.step_num - 1
self.step_select.value = self.selected_step_index
self.update_result_options()
await self.update_content()
async def action_previous_sample(self) -> None:
self.selected_sample_index -= 1
if self.selected_sample_index < 0:
self.selected_sample_index = self.sample_num - 1
self.sample_select.value = self.selected_sample_index
await self._clear_search()
await self.update_content()
async def action_switch_render(self):
self.render_table = not self.render_table
await self.update_content()
def action_toggle_search(self) -> None:
self.search_box.focus()
async def action_cancel_search(self) -> None:
self.search_box.value = ""
await self._clear_search()
await self.update_content()
async def _clear_search(self):
self.matches = []
self.search_status.update("")
self.current_match_index = 0
@on(Input.Submitted, "#search-box")
async def on_search_submitted(self, event: Input.Submitted) -> None:
self.matches = []
self.current_match_index = 0
if event.value:
await self.update_content(event.value)
renderable = self.content_display.render()
if isinstance(renderable, Table):
return
assert isinstance(renderable, Text)
console = self.content_display._console
lines = renderable.wrap(console, self.scroll_view.container_size.width)
line_idx_recorded = set()
for line_idx, line in enumerate(lines):
if line_idx in line_idx_recorded:
continue
if event.value in line:
self.matches.append(
{
"line": line_idx,
"word": event.value,
}
)
line_idx_recorded.add(line_idx)
self.scroll_view.focus()
await self.action_next_search()
async def action_next_search(self) -> None:
if not self.matches or self.current_match_index >= len(self.matches):
return
target_line = self.matches[self.current_match_index]["line"]
self.scroll_view.scroll_to(x=0, y=target_line * 1, animate=False)
self.current_match_index = (self.current_match_index + 1) % len(self.matches)
self.search_status.update(
Text(
f"Find :{self.current_match_index + 1}/{len(self.matches)}",
style="bold on #8f51b5",
)
)
def action_page_up(self):
self.scroll_view.scroll_page_up(animate=False)
def action_page_down(self):
self.scroll_view.scroll_page_down(animate=False)
def action_page_home(self):
self.scroll_view.scroll_home(animate=False)
def action_page_end(self):
self.scroll_view.scroll_end(animate=False)
async def _run(path: Path, mask_str: str):
assert path.exists(), f"{path} not exist"
paths = list(path.glob(f"*{FILE_SUFFIX}"))
paths = sorted(paths, key=lambda x: int(x.stem))
if not paths:
raise ValueError(f"no available reward dump files under f{path}")
print(f"get jsonl file nums: {len(paths)}")
pbar = ProgressBar(total=len(paths), name="data load progress")
data = {}
await load_path(paths[0], data, mask_str, 0, pbar)
app = JsonLineViewer(step_num=len(paths), data=data, pbar=pbar)
await asyncio.gather(load_dir(path, data, pbar, mask_str), app.run_async())
app = typer.Typer()
@app.command(help="launch TUI APP")
def run(
rollout_data_dir: Path,
mask_str: Annotated[str, typer.Option(help="regex pattern; matches will be masked to *")] = r"<\|image_pad\|>|<\|imgpad\|>",
):
loop = asyncio.get_event_loop()
loop.run_until_complete(_run(rollout_data_dir, mask_str))
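# Example launch (directory path is an illustrative placeholder):
#   python <this_script>.py /path/to/rollout_dump_dir
# The directory is expected to contain step-numbered jsonl files (e.g. 1.jsonl, 2.jsonl, ...),
# since _run above sorts files by the integer value of their stem.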
if __name__ == "__main__":
app()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# setup.py is the fallback installation script when pyproject.toml does not work
import os
from pathlib import Path
from setuptools import find_packages, setup
version_folder = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(version_folder, "verl/version/version")) as f:
__version__ = f.read().strip()
install_requires = [
"accelerate",
"codetiming",
"datasets",
"dill",
"hydra-core",
"numpy<2.0.0",
"pandas",
"peft",
"pyarrow>=19.0.0",
"pybind11",
"pylatexenc",
"ray[default]>=2.41.0",
"torchdata",
"tensordict>=0.8.0,<=0.9.1,!=0.9.0",
"transformers",
"wandb",
"packaging>=20.0",
]
TEST_REQUIRES = ["pytest", "pre-commit", "py-spy", "pytest-asyncio"]
PRIME_REQUIRES = ["pyext"]
GEO_REQUIRES = ["mathruler", "torchvision", "qwen_vl_utils"]
GPU_REQUIRES = ["liger-kernel", "flash-attn"]
MATH_REQUIRES = ["math-verify"] # Add math-verify as an optional dependency
VLLM_REQUIRES = ["tensordict>=0.8.0,<=0.9.1,!=0.9.0", "vllm>=0.7.3,<=0.9.1"]
SGLANG_REQUIRES = [
"tensordict>=0.8.0,<=0.9.1,!=0.9.0",
"sglang[srt,openai]==0.4.8",
"torch-memory-saver>=0.0.8",
"torch==2.7.1",
]
TRL_REQUIRES = ["trl<=0.9.6"]
MCORE_REQUIRES = ["mbridge"]
extras_require = {
"test": TEST_REQUIRES,
"prime": PRIME_REQUIRES,
"geo": GEO_REQUIRES,
"gpu": GPU_REQUIRES,
"math": MATH_REQUIRES,
"vllm": VLLM_REQUIRES,
"sglang": SGLANG_REQUIRES,
"trl": TRL_REQUIRES,
"mcore": MCORE_REQUIRES,
}
this_directory = Path(__file__).parent
long_description = (this_directory / "README.md").read_text()
setup(
name="verl",
version=__version__,
package_dir={"": "."},
packages=find_packages(where="."),
url="https://github.com/volcengine/verl",
license="Apache 2.0",
author="Bytedance - Seed - MLSys",
author_email="zhangchi.usc1992@bytedance.com, gmsheng@connect.hku.hk",
description="verl: Volcano Engine Reinforcement Learning for LLM",
install_requires=install_requires,
extras_require=extras_require,
package_data={
"": ["version/*"],
"verl": ["trainer/config/*.yaml"],
},
include_package_data=True,
long_description=long_description,
long_description_content_type="text/markdown",
)
import torch
print(torch.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
print(torch.version.hip)
print(torch.cuda.get_device_name(0))
print(torch.cuda.device_count())
import vllm
print(vllm.__version__)
import multiprocessing
cpu_count = multiprocessing.cpu_count()
print(f"当前系统可用 CPU 核数: {cpu_count}")
# Tests layout
Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
- `tests/trainer` for testing functionality related to `verl/trainer`
- `tests/models` for testing functionality related to `verl/models`
- ...
There are a few folders with `special_` prefix, created for special purposes:
- `special_distributed`: unit tests that must run with multiple GPUs
- `special_e2e`: end-to-end tests with training/generation scripts
- `special_npu`: tests for NPUs
- `special_sanity`: a suite of quick sanity tests
- `special_standalone`: a set of tests designed to run in dedicated environments
# Accelerators for tests
- By default, tests are run with GPUs available, except for those under `special_npu` and any test script whose name ends with `on_cpu.py`.
- Test scripts with the `on_cpu.py` name suffix are run on CPU resources in a Linux environment.
# Workflow layout
All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
1. A list of always-triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
3. End-to-end tests: `e2e_*.yml`
4. Unit tests
- `cpu_unit_tests.yml` runs pytest on all scripts matching the file name pattern `tests/**/test_*_on_cpu.py` (see the sketch after this list)
- `gpu_unit_tests.yml` runs pytest on all test scripts whose file names do not end with the `on_cpu.py` suffix
- Since the CPU/GPU unit test workflows run all tests under `tests` by default, please make sure tests are manually excluded from them when:
- a new workflow yaml is added to `.github/workflows`
- new tests are added to one of the workflows mentioned in 2.
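As a rough local approximation of the two unit-test workflows (the authoritative commands live in the yaml files under `.github/workflows/` and may differ), the selection rules above correspond to something like:
```bash
# CPU unit tests: only files named test_*_on_cpu.py
find tests -name "test_*_on_cpu.py" -print0 | xargs -0 pytest -s
# GPU unit tests: all test files except the on_cpu.py ones
find tests -name "test_*.py" ! -name "*_on_cpu.py" -print0 | xargs -0 pytest -s
```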
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ray
from omegaconf import DictConfig
from verl.experimental.agent_loop import AgentLoopManager
from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
from verl.single_controller.ray.base import create_colocated_worker_cls
from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker
def init_agent_loop_manager(config: DictConfig) -> AgentLoopManager | RayWorkerGroup:
# =========================== 1. Create hybrid ActorRollout workers ===========================
actor_rollout_cls = (
AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker
)
role_worker_mapping = {
Role.ActorRollout: ray.remote(actor_rollout_cls),
}
global_pool_id = "global_pool"
resource_pool_spec = {
global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
}
mapping = {
Role.ActorRollout: global_pool_id,
}
resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
resource_pool_manager.create_resource_pool()
resource_pool_to_cls = {pool: {} for pool in resource_pool_manager.resource_pool_dict.values()}
# create actor and rollout
resource_pool = resource_pool_manager.get_resource_pool(Role.ActorRollout)
actor_rollout_cls = RayClassWithInitArgs(
cls=role_worker_mapping[Role.ActorRollout], config=config.actor_rollout_ref, role="actor_rollout"
)
resource_pool_to_cls[resource_pool]["actor_rollout"] = actor_rollout_cls
all_wg = {}
for resource_pool, class_dict in resource_pool_to_cls.items():
worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
wg_dict = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=worker_dict_cls)
spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
all_wg.update(spawn_wg)
actor_rollout_wg = all_wg["actor_rollout"]
actor_rollout_wg.init_model()
if config.actor_rollout_ref.rollout.mode == "sync":
return actor_rollout_wg
# =========================== 2. Create AgentLoopManager ===========================
agent_loop_manager = AgentLoopManager(
config=config,
worker_group=actor_rollout_wg,
)
return agent_loop_manager
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from typing import Any
import numpy as np
import pytest
import ray
from omegaconf import DictConfig
from transformers.utils import get_json_schema
from tests.experimental.agent_loop.agent_utils import init_agent_loop_manager
from verl.experimental.agent_loop.agent_loop import get_trajectory_info
from verl.protocol import DataProto
from verl.tools.base_tool import BaseTool, OpenAIFunctionToolSchema
from verl.utils import hf_tokenizer
@pytest.fixture
def init_config() -> DictConfig:
from hydra import compose, initialize_config_dir
with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
config = compose(
config_name="ppo_trainer",
overrides=[
"actor_rollout_ref.actor.use_dynamic_bsz=true",
# test sleep/wake_up with fsdp offload
"actor_rollout_ref.actor.fsdp_config.param_offload=True",
"actor_rollout_ref.actor.fsdp_config.optimizer_offload=True",
],
)
model_path = "Qwen/Qwen2.5-1.5B-Instruct"
config.actor_rollout_ref.model.path = model_path
config.actor_rollout_ref.rollout.name = os.getenv("ROLLOUT_NAME", "vllm")
config.actor_rollout_ref.rollout.mode = "async"
config.actor_rollout_ref.rollout.prompt_length = 4096
config.actor_rollout_ref.rollout.response_length = 4096
config.actor_rollout_ref.rollout.n = 4
config.actor_rollout_ref.rollout.agent.num_workers = 2
return config
def test_single_turn(init_config):
ray.init(
runtime_env={
"env_vars": {
"TOKENIZERS_PARALLELISM": "true",
"NCCL_DEBUG": "WARN",
"VLLM_LOGGING_LEVEL": "INFO",
"VLLM_USE_V1": "1",
}
}
)
agent_loop_manager = init_agent_loop_manager(init_config)
raw_prompts = [
[
{
"role": "user",
"content": "Let's play a role playing game. Your name is Alice, your favorite color is blue.",
}
],
[{"role": "user", "content": "Let's play a role playing game. Your name is Bob, your favorite color is red."}],
]
batch = DataProto(
non_tensor_batch={
"raw_prompt": np.array(raw_prompts),
"agent_name": np.array(["single_turn_agent"] * len(raw_prompts)),
},
)
n = init_config.actor_rollout_ref.rollout.n
batch = batch.repeat(n)
result = agent_loop_manager.generate_sequences(prompts=batch)
assert len(result) == len(raw_prompts) * n
# check result
seq_len = result.batch["prompts"].size(1) + result.batch["responses"].size(1)
assert result.batch["input_ids"].size(1) == seq_len
assert result.batch["attention_mask"].size(1) == seq_len
assert result.batch["position_ids"].size(1) == seq_len
# check turns
num_turns = result.non_tensor_batch["__num_turns__"]
assert np.all(num_turns == 2)
print("Test passed!")
ray.shutdown()
class WeatherTool(BaseTool):
def get_current_temperature(self, location: str, unit: str = "celsius"):
"""Get current temperature at a location.
Args:
location: The location to get the temperature for, in the format "City, State, Country".
unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"])
Returns:
the temperature, the location, and the unit in a dict
"""
print(f"[DEBUG] get_current_temperature: {location}, {unit}")
return {
"temperature": 26.1,
"location": location,
"unit": unit,
}
def get_openai_tool_schema(self) -> OpenAIFunctionToolSchema:
schema = get_json_schema(self.get_current_temperature)
return OpenAIFunctionToolSchema(**schema)
async def execute(self, instance_id: str, parameters: dict[str, Any], **kwargs) -> tuple[str, float, dict]:
try:
result = self.get_current_temperature(**parameters)
return json.dumps(result), 0, {}
except Exception as e:
return str(e), 0, {}
class WeatherToolWithData(BaseTool):
def get_openai_tool_schema(self) -> OpenAIFunctionToolSchema:
schema = get_json_schema(self.get_temperature_date)
return OpenAIFunctionToolSchema(**schema)
def get_temperature_date(self, location: str, date: str, unit: str = "celsius"):
"""Get temperature at a location and date.
Args:
location: The location to get the temperature for, in the format "City, State, Country".
date: The date to get the temperature for, in the format "Year-Month-Day".
unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"])
Returns:
the temperature, the location, the date and the unit in a dict
"""
print(f"[DEBUG] get_temperature_date: {location}, {date}, {unit}")
return {
"temperature": 25.9,
"location": location,
"date": date,
"unit": unit,
}
async def execute(self, instance_id: str, parameters: dict[str, Any], **kwargs) -> tuple[str, float, dict]:
try:
result = self.get_temperature_date(**parameters)
return json.dumps(result), 0, {}
except Exception as e:
return str(e), 0, {}
def test_tool_agent(init_config):
ray.init(
runtime_env={
"env_vars": {
"TOKENIZERS_PARALLELISM": "true",
"NCCL_DEBUG": "WARN",
"VLLM_LOGGING_LEVEL": "INFO",
"VLLM_USE_V1": "1",
}
}
)
# =========================== 1. Init rollout manager ===========================
tool_config = {
"tools": [
{
"class_name": "tests.experimental.agent_loop.test_basic_agent_loop.WeatherTool",
"config": {"type": "native"},
},
{
"class_name": "tests.experimental.agent_loop.test_basic_agent_loop.WeatherToolWithData",
"config": {"type": "native"},
},
]
}
tool_config_path = "/tmp/tool_config.json"
with open(tool_config_path, "w") as f:
json.dump(tool_config, f)
n = 2
init_config.actor_rollout_ref.rollout.n = n
init_config.actor_rollout_ref.rollout.multi_turn.tool_config_path = tool_config_path
init_config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls = 2
agent_loop_manager = init_agent_loop_manager(init_config)
# =========================== 2. Generate sequences ===========================
raw_prompts = [
[
{"role": "user", "content": "How are you?"},
],
[
{"role": "user", "content": "What's the temperature in Los Angeles now?"},
],
[
{"role": "user", "content": "What's the temperature in New York now?"},
],
[
{
"role": "system",
"content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\n\n"
"Current Date: 2024-09-30",
},
{"role": "user", "content": "What's the temperature in San Francisco now? How about tomorrow?"},
],
]
batch = DataProto(
non_tensor_batch={
"raw_prompt": np.array([np.array(prompt) for prompt in raw_prompts], dtype=object),
"agent_name": np.array(["tool_agent"] * len(raw_prompts)),
},
)
batch = batch.repeat(n)
result = agent_loop_manager.generate_sequences(prompts=batch)
assert len(result) == len(raw_prompts) * n
# Check turns
num_turns = result.non_tensor_batch["__num_turns__"]
print(f"num_turns: {num_turns}")
for i in range(len(num_turns)):
if i // n == 0:
# [user, assistant]
assert num_turns[i] == 2
else:
# [user, assistant, tool, assistant]
assert num_turns[i] == 4
# Check response_mask
tokenizer = hf_tokenizer(init_config.actor_rollout_ref.model.path)
responses = result.batch["responses"]
response_mask = result.batch["response_mask"]
attention_mask = result.batch["attention_mask"]
assert responses.size() == response_mask.size(), f"{responses.size()} != {response_mask.size()}"
response_length = response_mask.size(1)
for i in range(len(responses)):
# response with tool response
valid_tokens = responses[i][attention_mask[i][-response_length:].bool()]
response_with_obs = tokenizer.decode(valid_tokens)
# response without tool response
valid_tokens = responses[i][response_mask[i].bool()]
response_without_obs = tokenizer.decode(valid_tokens)
assert "<tool_response>" not in response_without_obs, (
f"found <tool_response> in response: {response_without_obs}"
)
assert "</tool_response>" not in response_without_obs, (
f"found </tool_response> in response: {response_without_obs}"
)
print("=========================")
print(response_with_obs)
print("---")
print(response_without_obs)
print("Test passed!")
ray.shutdown()
@pytest.mark.asyncio
async def test_get_trajectory_info():
"""Tests the get_trajectory_info method."""
# Initialize the class to set up class-level attributes
step = 10
index = [1, 1, 3, 3]
expected_info = [
{"step": step, "sample_index": 1, "rollout_n": 0, "validate": False},
{"step": step, "sample_index": 1, "rollout_n": 1, "validate": False},
{"step": step, "sample_index": 3, "rollout_n": 0, "validate": False},
{"step": step, "sample_index": 3, "rollout_n": 1, "validate": False},
]
trajectory_info = await get_trajectory_info(step, index, validate=False)
assert trajectory_info == expected_info
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.