Commit 20247eb8 authored by chenych

Update 0506

parent 6065b946
......@@ -101,6 +101,13 @@ pip install "tensordict<0.6"
pip install -e .
```
### Datasets
You can construct your own dataset based on the sample datasets below (a launch sketch follows this list):
- Text dataset: https://huggingface.co/datasets/hiyouga/math12k
- Image-text dataset: https://huggingface.co/datasets/hiyouga/geometry3k
- Multi-image-text dataset: https://huggingface.co/datasets/hiyouga/journeybench-multi-image-vqa
### GRPO Training
```bash
......@@ -114,7 +121,8 @@ python3 scripts/model_merger.py --local_dir path_to_your_actor_checkpoint
```
> [!NOTE]
> If you cannot connect to Hugging Face, first run `pip install -U huggingface_hub hf_transfer`, then add the `export HF_ENDPOINT=https://hf-mirror.com` command before launching (a combined example follows this note).
>
> If you want to use the SwanLab logger, consider using `bash examples/qwen2_5_vl_7b_geo3k_swanlab.sh`.
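Putting the note together, a combined launch with the mirror enabled and the SwanLab logger example looks like:
```bash
pip install -U huggingface_hub hf_transfer
export HF_ENDPOINT=https://hf-mirror.com  # route Hugging Face downloads through the mirror
bash examples/qwen2_5_vl_7b_geo3k_swanlab.sh
```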
## Custom Dataset
......@@ -137,3 +145,17 @@ python3 scripts/model_merger.py --local_dir path_to_your_actor_checkpoint
These features are temporarily disabled, and we plan to fix them one by one in future updates.
- Vision-language models are currently incompatible with padding-free training and the DeepSpeed Ulysses parallelism method.
### Common Issues and Solutions
> ValueError: Image features and image tokens do not match: tokens: 8192, features 9800
Increase the value of `data.max_prompt_length` or decrease the value of `data.max_pixels`.
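For example, these settings can be passed as command-line overrides appended to your usual launch command (the values are illustrative only and assume `MODEL_PATH` is set as in the example scripts):
```bash
python3 -m verl.trainer.main \
    config=examples/config.yaml \
    worker.actor.model.model_path=${MODEL_PATH} \
    data.max_prompt_length=4096 \
    data.max_pixels=1048576  # illustrative values; tune for your prompts and GPU memory
```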
> RuntimeError: CUDA Error: out of memory at /workspace/csrc/cumem_allocator.cpp:62
Decrease the value of `worker.rollout.gpu_memory_utilization` and make sure `worker.actor.offload.offload_params` is enabled.
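Likewise, a sketch of the corresponding overrides (the `0.5` value is illustrative; `MODEL_PATH` is assumed to be set as in the example scripts):
```bash
python3 -m verl.trainer.main \
    config=examples/config.yaml \
    worker.actor.model.model_path=${MODEL_PATH} \
    worker.rollout.gpu_memory_utilization=0.5 \
    worker.actor.offload.offload_params=true  # offload parameters to CPU to free GPU memory
```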
> RuntimeError: 0 active drivers ([]). There should only be one.
Uninstall `deepspeed` from the current Python environment.
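For example, in the active Python environment:
```bash
pip uninstall -y deepspeed
```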
#!/bin/bash
set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-3B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \
......@@ -9,6 +13,7 @@ python3 -m verl.trainer.main \
data.format_prompt=./examples/format_prompt/r1v_format.jinja \
worker.actor.model.model_path=${MODEL_PATH} \
worker.rollout.tensor_parallel_size=1 \
worker.reward.reward_type=sequential \
worker.reward.reward_function=./examples/reward_function/r1v.py:compute_score \
trainer.experiment_name=qwen2_5_vl_3b_clevr \
trainer.n_gpus_per_node=2
#!/bin/bash
set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-3B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \
......@@ -9,6 +13,7 @@ python3 -m verl.trainer.main \
data.format_prompt=./examples/format_prompt/r1v_format.jinja \
worker.actor.model.model_path=${MODEL_PATH} \
worker.rollout.tensor_parallel_size=1 \
worker.reward.reward_type=sequential \
worker.reward.reward_function=./examples/reward_function/r1v.py:compute_score \
trainer.experiment_name=qwen2_5_vl_3b_geoqa8k \
trainer.n_gpus_per_node=8
......@@ -9,6 +9,7 @@ data:
rollout_batch_size: 512
val_batch_size: 1024
format_prompt: ./examples/format_prompt/math_format.jinja
override_chat_template: null
shuffle: true
seed: 1
max_pixels: 4194304
......@@ -70,16 +71,17 @@ worker:
offload_params: false
reward:
reward_type: function
reward_type: batch
reward_function: ./examples/reward_function/math.py:compute_score
trainer:
total_episodes: 15
logger: ["console", "wandb"]
total_epochs: 15
max_steps: null
project_name: easy_r1
experiment_name: qwen2_5_7b_math_grpo
n_gpus_per_node: 8
logger: ["console", "wandb"]
nnodes: 1
n_gpus_per_node: 8
val_freq: 5 # -1 to disable
val_before_train: true
val_only: false
......
#!/bin/bash
set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-7B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \
config=examples/config.yaml \
data.train_files=hiyouga/math12k@train \
data.val_files=hiyouga/math12k@test \
worker.actor.model.model_path=${MODEL_PATH} \
trainer.experiment_name=qwen2_5_7b_math_grpo \
trainer.n_gpus_per_node=8
worker.actor.model.model_path=${MODEL_PATH}
#!/bin/bash
set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-32B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \
......
#!/bin/bash
set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-3B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \
......
#!/bin/bash
set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \
......
#!/bin/bash
set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \
......
#!/bin/bash
set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \
......
#!/bin/bash
# REMINDER: this script uses test data split and should ONLY be used for debugging. DO NOT use for training.
set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \
......
#!/bin/bash
set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen3-4B # replace it with your local file path
python3 -m verl.trainer.main \
config=examples/config.yaml \
data.max_response_length=4096 \
worker.actor.model.model_path=${MODEL_PATH} \
trainer.experiment_name=qwen3_4b_math_grpo
......@@ -13,28 +13,34 @@
# limitations under the License.
import re
from typing import Dict
from typing import Dict, List
from mathruler.grader import extract_boxed_content, grade_answer
def format_reward(predict_str: str) -> float:
def format_reward(predict: str) -> float:
pattern = re.compile(r"<think>.*</think>.*\\boxed\{.*\}.*", re.DOTALL)
format_match = re.fullmatch(pattern, predict_str)
format_match = re.fullmatch(pattern, predict)
return 1.0 if format_match else 0.0
def accuracy_reward(predict_str: str, ground_truth: str) -> float:
answer = extract_boxed_content(predict_str)
def accuracy_reward(predict: str, ground_truth: str) -> float:
answer = extract_boxed_content(predict)
return 1.0 if grade_answer(answer, ground_truth) else 0.0
def compute_score(predict_str: str, ground_truth: str, format_weight: float = 0.1) -> Dict[str, float]:
predict_str = re.sub(r"\s*(<|>|/)\s*", r"\1", predict_str) # handle qwen2.5vl-32b format
format_score = format_reward(predict_str)
accuracy_score = accuracy_reward(predict_str, ground_truth)
return {
"overall": (1 - format_weight) * accuracy_score + format_weight * format_score,
"format": format_score,
"accuracy": accuracy_score,
}
def compute_score(predicts: List[str], ground_truths: List[str], format_weight: float = 0.1) -> List[Dict[str, float]]:
scores = []
for predict, ground_truth in zip(predicts, ground_truths):
predict = re.sub(r"\s*(<|>|/)\s*", r"\1", predict) # handle qwen2.5vl-32b format
format_score = format_reward(predict)
accuracy_score = accuracy_reward(predict, ground_truth)
scores.append(
{
"overall": (1 - format_weight) * accuracy_score + format_weight * format_score,
"format": format_score,
"accuracy": accuracy_score,
}
)
return scores
......@@ -18,16 +18,16 @@ from typing import Dict
from mathruler.grader import grade_answer
def format_reward(predict_str: str) -> float:
def format_reward(predict: str) -> float:
pattern = re.compile(r"<think>.*?</think>\s*<answer>.*?</answer>", re.DOTALL)
format_match = re.fullmatch(pattern, predict_str)
format_match = re.fullmatch(pattern, predict)
return 1.0 if format_match else 0.0
def accuracy_reward(predict_str: str, ground_truth: str) -> float:
def accuracy_reward(predict: str, ground_truth: str) -> float:
try:
content_match = re.search(r"<answer>(.*?)</answer>", predict_str)
given_answer = content_match.group(1).strip() if content_match else predict_str.strip()
content_match = re.search(r"<answer>(.*?)</answer>", predict)
given_answer = content_match.group(1).strip() if content_match else predict.strip()
if grade_answer(given_answer, ground_truth.strip()):
return 1.0
......@@ -37,9 +37,9 @@ def accuracy_reward(predict_str: str, ground_truth: str) -> float:
return 0.0
def compute_score(predict_str: str, ground_truth: str, format_weight: float = 0.5) -> Dict[str, float]:
format_score = format_reward(predict_str)
accuracy_score = accuracy_reward(predict_str, ground_truth)
def compute_score(predict: str, ground_truth: str, format_weight: float = 0.5) -> Dict[str, float]:
format_score = format_reward(predict)
accuracy_score = accuracy_reward(predict, ground_truth)
return {
"overall": (1 - format_weight) * accuracy_score + format_weight * format_score,
"format": format_score,
......
......@@ -3,6 +3,7 @@ excludes: ["/.git/"]
env_vars:
TOKENIZERS_PARALLELISM: "true"
NCCL_DEBUG: "WARN"
VLLM_LOGGING_LEVEL: "INFO"
VLLM_LOGGING_LEVEL: "WARN"
TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
PYTHONUNBUFFERED: "1"
......@@ -20,7 +20,7 @@ from .transformers.qwen2_vl import qwen2_vl_attn_forward
def apply_ulysses_patch(model_type: str) -> None:
if model_type in ("llama", "gemma", "gemma2", "mistral", "qwen2"):
if model_type in ("llama", "gemma", "gemma2", "mistral", "qwen2", "qwen3", "qwen3_moe"):
ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward
elif model_type in ("qwen2_vl", "qwen2_5_vl"):
from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLFlashAttention2
......
......@@ -43,6 +43,7 @@ class DataConfig:
rollout_batch_size: int = 512
val_batch_size: int = -1
format_prompt: Optional[str] = None
override_chat_template: Optional[str] = None
shuffle: bool = True
seed: int = 1
max_pixels: int = 4194304
......@@ -73,7 +74,7 @@ class AlgorithmConfig:
@dataclass
class TrainerConfig:
total_episodes: int = 10
total_epochs: int = 10
max_steps: Optional[int] = None
project_name: str = "easy_r1"
experiment_name: str = "demo"
......
......@@ -20,7 +20,7 @@ from omegaconf import OmegaConf
from ..single_controller.ray import RayWorkerGroup
from ..utils.tokenizer import get_processor, get_tokenizer
from ..workers.fsdp_workers import FSDPWorker
from ..workers.reward import FunctionRewardManager
from ..workers.reward import BatchFunctionRewardManager, SequentialFunctionRewardManager
from .config import PPOConfig
from .data_loader import create_dataloader
from .ray_trainer import RayPPOTrainer, ResourcePoolManager, Role
......@@ -38,11 +38,13 @@ class Runner:
# instantiate tokenizer
tokenizer = get_tokenizer(
config.worker.actor.model.model_path,
override_chat_template=config.data.override_chat_template,
trust_remote_code=config.worker.actor.model.trust_remote_code,
use_fast=True,
)
processor = get_processor(
config.worker.actor.model.model_path,
override_chat_template=config.data.override_chat_template,
trust_remote_code=config.worker.actor.model.trust_remote_code,
use_fast=True,
)
......@@ -65,7 +67,14 @@ class Runner:
}
resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
RemoteRewardManager = ray.remote(FunctionRewardManager).options(num_cpus=config.worker.reward.num_cpus)
if config.worker.reward.reward_type == "sequential":
RewardManager = SequentialFunctionRewardManager
elif config.worker.reward.reward_type == "batch":
RewardManager = BatchFunctionRewardManager
else:
raise NotImplementedError(f"Unknown reward type {config.worker.reward.reward_type}.")
RemoteRewardManager = ray.remote(RewardManager).options(num_cpus=config.worker.reward.num_cpus)
reward_fn = RemoteRewardManager.remote(config.worker.reward, tokenizer)
val_reward_fn = RemoteRewardManager.remote(config.worker.reward, tokenizer)
......@@ -117,7 +126,6 @@ def main():
runtime_env=runtime_env)
else:
ray.init(runtime_env=runtime_env)
runner = Runner.remote()
ray.get(runner.run.remote(ppo_config))
......
......@@ -247,7 +247,7 @@ class RayPPOTrainer:
if config.trainer.max_steps is not None:
self.training_steps = config.trainer.max_steps
else:
self.training_steps = len(train_dataloader) * config.trainer.total_episodes
self.training_steps = len(train_dataloader) * config.trainer.total_epochs
config.worker.actor.optim.training_steps = self.training_steps
config.worker.critic.optim.training_steps = self.training_steps
......@@ -473,7 +473,7 @@ class RayPPOTrainer:
if self.config.trainer.val_only:
return
for _ in tqdm(range(self.config.trainer.total_episodes), desc="Episode", position=0):
for _ in tqdm(range(self.config.trainer.total_epochs), desc="Epoch", position=0):
for batch_dict in tqdm(self.train_dataloader, desc="Running step", position=1):
self.global_step += 1
if self.global_step > self.training_steps:
......
......@@ -55,11 +55,13 @@ class FSDPCheckpointManager(BaseCheckpointManager):
# every rank download its own checkpoint
model_path = os.path.join(path, f"model_world_size_{self.world_size}_rank_{self.rank}.pt")
optim_path = os.path.join(path, f"optim_world_size_{self.world_size}_rank_{self.rank}.pt")
extra_state_path = os.path.join(path, f"extra_state_world_size_{self.world_size}_rank_{self.rank}.pt")
print(f"[rank-{self.rank}]: Loading from {model_path} and {optim_path} and {extra_state_path}.")
extra_path = os.path.join(path, f"extra_state_world_size_{self.world_size}_rank_{self.rank}.pt")
print(f"[rank-{self.rank}]: Loading model from {os.path.abspath(model_path)}.")
print(f"[rank-{self.rank}]: Loading optimizer from {os.path.abspath(optim_path)}.")
print(f"[rank-{self.rank}]: Loading extra_state from {os.path.abspath(extra_path)}.")
model_state_dict = torch.load(model_path, weights_only=False)
optim_state_dict = torch.load(optim_path, weights_only=False)
extra_state_dict = torch.load(extra_state_path, weights_only=False)
extra_state_dict = torch.load(extra_path, weights_only=False)
state_dict_options = StateDictOptions(cpu_offload=True)
set_state_dict(
......@@ -91,7 +93,7 @@ class FSDPCheckpointManager(BaseCheckpointManager):
extra_path = os.path.join(path, f"extra_state_world_size_{self.world_size}_rank_{self.rank}.pt")
print(f"[rank-{self.rank}]: Saving model to {os.path.abspath(model_path)}.")
print(f"[rank-{self.rank}]: Saving checkpoint to {os.path.abspath(model_path)}.")
print(f"[rank-{self.rank}]: Saving optimizer to {os.path.abspath(optim_path)}.")
print(f"[rank-{self.rank}]: Saving extra_state to {os.path.abspath(extra_path)}.")
torch.save(model_state_dict, model_path)
torch.save(optim_state_dict, optim_path)
......