Commit 496acb03 authored by chenych's avatar chenych
Browse files

v0.3.0

parent d8de2ca8
# Start from the NVIDIA official image (ubuntu-22.04 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
# MAX_JOBS bounds parallel compile jobs for any from-source pip builds.
ENV MAX_JOBS=32
# vLLM worker processes are started with "spawn" instead of "fork".
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
# Suppress interactive apt prompts during image build.
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
# Enable the hf_transfer download backend for Hugging Face Hub.
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments (mirror URLs; override with --build-arg if needed)
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Pin the vLLM nightly wheel index to a specific upstream commit.
ARG VLLM_COMMIT=227578480d71fc94ef46ca77fb69496412158d68
# Set apt source: back up the stock sources.list, then point every jammy suite at the mirror
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl (via systemd; keep maintainer config files on conflict)
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini (minimal init process for signal handling / zombie reaping)
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Change pip source to the mirror and upgrade pip itself
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork: packages preinstalled by the NGC image that would
# conflict with the exact versions installed in later layers
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core
# Install vllm-0.7.4-nightly from the per-commit wheel index, then overlay the
# patched vllm sources from the hiyouga fork (verl_v1 branch) onto the installed
# package. Use a shallow clone and remove the checkout afterwards so the full
# git history and working tree do not bloat the image layer.
RUN pip install --no-cache-dir vllm --pre --extra-index-url "https://wheels.vllm.ai/${VLLM_COMMIT}" && \
git clone --depth=1 -b verl_v1 https://github.com/hiyouga/vllm.git && \
cp -r vllm/vllm/ /usr/local/lib/python3.10/dist-packages/ && \
rm -rf vllm
# Install torch-2.5.1 and the training/runtime dependency stack.
# NOTE: version specifiers containing ">=" must be quoted — otherwise the shell
# parses ">" as output redirection and the version constraint is silently lost
# (e.g. `transformers>=4.49.0` installs unpinned `transformers` and creates a
# file named "=4.49.0"). `ray[default]` is quoted to keep brackets out of glob
# expansion as well.
RUN pip install --no-cache-dir torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 tensordict torchdata \
"transformers>=4.49.0" accelerate datasets peft hf-transfer \
"ray[default]" codetiming hydra-core pandas "pyarrow>=15.0.0" pylatexenc qwen-vl-utils wandb liger-kernel mathruler \
pytest yapf py-spy pyext pre-commit ruff
# Install flash_attn-2.7.4.post1 from the prebuilt cu12/torch2.5/cp310 wheel,
# then delete the wheel file so it does not remain in the image layer.
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
rm -f flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix cv2: replace the NGC image's pynvml/opencv variants with compatible ones.
# Quote ">=" specifiers so the shell does not treat ">" as output redirection
# and silently drop the version constraint.
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir "nvidia-ml-py>=12.560.30" opencv-python-headless==4.8.0.74 fastapi==0.115.6 && \
pip install --no-cache-dir --upgrade "optree>=0.13.0"
# Reset pip config so the build-time mirror does not leak into the final image
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# Start from the NVIDIA official image (ubuntu-22.04 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
# MAX_JOBS bounds parallel compile jobs for any from-source pip builds.
ENV MAX_JOBS=32
# vLLM worker processes are started with "spawn" instead of "fork".
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
# Suppress interactive apt prompts during image build.
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
# Enable the hf_transfer download backend for Hugging Face Hub.
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments (mirror URLs; override with --build-arg if needed)
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source: back up the stock sources.list, then point every jammy suite at the mirror
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl (via systemd; keep maintainer config files on conflict)
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini (minimal init process for signal handling / zombie reaping)
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Change pip source to the mirror and upgrade pip itself
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork: packages preinstalled by the NGC image that would
# conflict with the exact versions installed in later layers
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core
# Install torch-2.6.0 + vllm-0.8.2 and the training/runtime dependency stack.
# NOTE: version specifiers containing ">=" must be quoted — otherwise the shell
# parses ">" as output redirection and the version constraint is silently lost
# (e.g. `transformers>=4.49.0` installs unpinned `transformers` and creates a
# file named "=4.49.0"). `ray[default]` is quoted to keep brackets out of glob
# expansion as well.
RUN pip install --no-cache-dir vllm==0.8.2 torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \
"transformers>=4.49.0" accelerate datasets peft hf-transfer \
"ray[default]" codetiming hydra-core pandas "pyarrow>=15.0.0" pylatexenc qwen-vl-utils wandb liger-kernel mathruler \
pytest yapf py-spy pyext pre-commit ruff
# Install flash_attn-2.7.4.post1 from the prebuilt cu12/torch2.6/cp310 wheel,
# then delete the wheel file so it does not remain in the image layer.
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
rm -f flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix cv2: replace the NGC image's pynvml/opencv variants with compatible ones.
# Quote ">=" specifiers so the shell does not treat ">" as output redirection
# and silently drop the version constraint.
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir "nvidia-ml-py>=12.560.30" opencv-python-headless==4.8.0.74 fastapi==0.115.6 && \
pip install --no-cache-dir --upgrade "optree>=0.13.0"
# Reset pip config so the build-time mirror does not leak into the final image
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
...@@ -27,7 +27,7 @@ EasyR1基于 **[HybirdEngine](https://arxiv.org/abs/2409.19256)** 和最新发 ...@@ -27,7 +27,7 @@ EasyR1基于 **[HybirdEngine](https://arxiv.org/abs/2409.19256)** 和最新发
- Python 3.10+ - Python 3.10+
- transformers>=4.49.0 - transformers>=4.49.0
- flash-attn==2.6.1+das.opt4.dtk2504 - flash-attn==2.6.1+das.opt4.dtk2504
- vllm>=0.8.5
### 硬件依赖 ### 硬件依赖
...@@ -52,10 +52,10 @@ EasyR1基于 **[HybirdEngine](https://arxiv.org/abs/2409.19256)** 和最新发 ...@@ -52,10 +52,10 @@ EasyR1基于 **[HybirdEngine](https://arxiv.org/abs/2409.19256)** 和最新发
#### Docker(方法一) #### Docker(方法一)
基于光源pytorch2.4.1+dtk25.04基础镜像环境:镜像下载地址:[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch),根据pytorch2.4.1、python、dtk及系统下载对应的镜像版本。 基于光源pytorch2.4.1+dtk25.04.1基础镜像环境:镜像下载地址:[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch),根据pytorch2.4.1、python、dtk及系统下载对应的镜像版本。
```bash ```bash
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10 docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250705
docker run -it --shm-size 200g --network=host --name docker_name --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro imageID bash docker run -it --shm-size 200g --network=host --name docker_name --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro imageID bash
...@@ -71,7 +71,7 @@ export LLAMA_NN=0 ...@@ -71,7 +71,7 @@ export LLAMA_NN=0
```bash ```bash
cd docker cd docker
docker build --no-cache -t easyR1:latest .
docker run -it --shm-size 200g --network=host --name docker_name --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro imageID bash docker run -it --shm-size 200g --network=host --name docker_name --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro imageID bash
## 安装所需环境包 ## 安装所需环境包
...@@ -86,12 +86,12 @@ export LLAMA_NN=0 ...@@ -86,12 +86,12 @@ export LLAMA_NN=0
#### Anaconda(方法三) #### Anaconda(方法三)
关于本项目DCU显卡所需的特殊深度学习库可从[光合](https://developer.hpccube.com/tool/)开发者社区下载安装。 关于本项目DCU显卡所需的特殊深度学习库可从[光合](https://developer.hpccube.com/tool/)开发者社区下载安装。
```bash ```bash
DTK驱动: dtk25.04 DTK驱动: dtk25.04.1
python: 3.10 python: 3.10
torch: 2.4.1 torch: 2.4.1+das.opt1.dtk25041
deepspeed: 0.14.2+das.opt2.dtk2504 deepspeed: 0.14.2+das.opt1.dtk25041
flash-attn: 2.6.1+das.opt4.dtk2504 flash-attn: 2.6.1+das.opt1.dtk25041
vllm: 0.7.2+das.opt1.c137085.dtk2504 vllm: 0.8.5.post1+das.opt2.dtk25041
``` ```
`Tips:以上dtk驱动、python、torch等DCU相关工具版本需要严格一一对应` `Tips:以上dtk驱动、python、torch等DCU相关工具版本需要严格一一对应`
...@@ -114,6 +114,7 @@ export LLAMA_NN=0 ...@@ -114,6 +114,7 @@ export LLAMA_NN=0
- Multi-image-text dataset: https://huggingface.co/datasets/hiyouga/journeybench-multi-image-vqa - Multi-image-text dataset: https://huggingface.co/datasets/hiyouga/journeybench-multi-image-vqa
### GRPO 训练 ### GRPO 训练
如果无法连接到Hugging Face,请先安装`pip install -U huggingface_hub hf_transfer`,再在启动前增加 `export HF_ENDPOINT=https://hf-mirror.com`命令 如果无法连接到Hugging Face,请先安装`pip install -U huggingface_hub hf_transfer`,再在启动前增加 `export HF_ENDPOINT=https://hf-mirror.com`命令
```bash ```bash
......
...@@ -42,7 +42,7 @@ We provide a [Dockerfile](./Dockerfile) to easily build environments. ...@@ -42,7 +42,7 @@ We provide a [Dockerfile](./Dockerfile) to easily build environments.
We recommend using the [pre-built docker image](https://hub.docker.com/r/hiyouga/verl) in EasyR1. We recommend using the [pre-built docker image](https://hub.docker.com/r/hiyouga/verl) in EasyR1.
```bash ```bash
docker pull hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0
``` ```
### Hardware Requirements ### Hardware Requirements
...@@ -136,10 +136,6 @@ We also reproduced the following two baselines of the [R1-V](https://github.com/ ...@@ -136,10 +136,6 @@ We also reproduced the following two baselines of the [R1-V](https://github.com/
- [CLEVR-70k-Counting](examples/baselines/qwen2_5_vl_3b_clevr.sh): Train the Qwen2.5-VL-3B-Instruct model on counting problem. - [CLEVR-70k-Counting](examples/baselines/qwen2_5_vl_3b_clevr.sh): Train the Qwen2.5-VL-3B-Instruct model on counting problem.
- [GeoQA-8k](examples/baselines/qwen2_5_vl_3b_geoqa8k.sh): Train the Qwen2.5-VL-3B-Instruct model on GeoQA problem. - [GeoQA-8k](examples/baselines/qwen2_5_vl_3b_geoqa8k.sh): Train the Qwen2.5-VL-3B-Instruct model on GeoQA problem.
## Performance Baselines
See [baselines.md](assets/baselines.md).
## Awesome Work using EasyR1 ## Awesome Work using EasyR1
- **MMR1**: Advancing the Frontiers of Multimodal Reasoning. [![[code]](https://img.shields.io/github/stars/LengSicong/MMR1)](https://github.com/LengSicong/MMR1) - **MMR1**: Advancing the Frontiers of Multimodal Reasoning. [![[code]](https://img.shields.io/github/stars/LengSicong/MMR1)](https://github.com/LengSicong/MMR1)
...@@ -147,8 +143,6 @@ See [baselines.md](assets/baselines.md). ...@@ -147,8 +143,6 @@ See [baselines.md](assets/baselines.md).
- **Seg-Zero**: Reasoning-Chain Guided Segmentation via Cognitive Reinforcement. [![[code]](https://img.shields.io/github/stars/dvlab-research/Seg-Zero)](https://github.com/dvlab-research/Seg-Zero) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.06520-blue)](https://arxiv.org/abs/2503.06520) - **Seg-Zero**: Reasoning-Chain Guided Segmentation via Cognitive Reinforcement. [![[code]](https://img.shields.io/github/stars/dvlab-research/Seg-Zero)](https://github.com/dvlab-research/Seg-Zero) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.06520-blue)](https://arxiv.org/abs/2503.06520)
- **MetaSpatial**: Reinforcing 3D Spatial Reasoning in VLMs for the Metaverse. [![[code]](https://img.shields.io/github/stars/PzySeere/MetaSpatial)](https://github.com/PzySeere/MetaSpatial) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.18470-blue)](https://arxiv.org/abs/2503.18470) - **MetaSpatial**: Reinforcing 3D Spatial Reasoning in VLMs for the Metaverse. [![[code]](https://img.shields.io/github/stars/PzySeere/MetaSpatial)](https://github.com/PzySeere/MetaSpatial) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.18470-blue)](https://arxiv.org/abs/2503.18470)
- **Temporal-R1**: Envolving Temporal Reasoning Capability into LMMs via Temporal Consistent Reward. [![[code]](https://img.shields.io/github/stars/appletea233/Temporal-R1)](https://github.com/appletea233/Temporal-R1) - **Temporal-R1**: Envolving Temporal Reasoning Capability into LMMs via Temporal Consistent Reward. [![[code]](https://img.shields.io/github/stars/appletea233/Temporal-R1)](https://github.com/appletea233/Temporal-R1)
- **NoisyRollout**: Reinforcing Visual Reasoning with Data Augmentation. [![[code]](https://img.shields.io/github/stars/John-AI-Lab/NoisyRollout)](https://github.com/John-AI-Lab/NoisyRollout) [![[arxiv]](https://img.shields.io/badge/arxiv-2504.13055-blue)](https://arxiv.org/pdf/2504.13055)
- **GUI-R1**: A Generalist R1-Style Vision-Language Action Model For GUI Agents. [![[code]](https://img.shields.io/github/stars/ritzz-ai/GUI-R1)](https://github.com/ritzz-ai/GUI-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2504.10458-blue)](https://arxiv.org/abs/2504.10458)
## TODO ## TODO
......
# Baselines
Environment: [hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0](https://hub.docker.com/layers/hiyouga/verl/ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0/images/sha256-335ed6cd1fe73090e458409cfa4394d6abf4cd0503ca44dbafdc28ff72e5ed20)
EasyR1 version: [v0.3.0](https://github.com/hiyouga/EasyR1/tree/v0.3.0)
Welcome to contribute new data points!
## Algorithm Baselines
### [Qwen2.5-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on [Math12k](https://huggingface.co/datasets/hiyouga/math12k)
| Size | Algorithm | Bits | LR | KL | Test Score |
| ---- | ----------- | ---- | ---- | ---- | ---------- |
| 7B | GRPO | AMP | 1e-6 | 1e-2 | 0.73->0.79 |
### [Qwen2.5-VL-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) on [Geometry3k](https://huggingface.co/datasets/hiyouga/geometry3k)
| Size | Algorithm | Bits | LR | KL | Test Score |
| ---- | ----------- | ---- | ---- | ---- | ---------- |
| 7B | GRPO | AMP | 1e-6 | 1e-2 | 0.39->0.52 |
| 7B | GRPO | BF16 | 1e-6 | 1e-2 | 0.39->0.52 |
| 7B | GRPO | AMP | 1e-6 | 1e-3 | 0.39->0.52 |
| 7B | RLOO | AMP | 1e-6 | 1e-2 | 0.39->0.53 |
| 3B | GRPO | AMP | 1e-6 | 1e-2 | 0.27->0.44 |
| 32B | GRPO | BF16 | 1e-6 | 1e-2 | 0.46->0.61 |
> [!NOTE]
> The hyper-parameters not listed are all the same as the default values.
## Performance Baselines
### [Qwen2.5-VL-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) on [Geometry3k](https://huggingface.co/datasets/hiyouga/geometry3k)
| Size | GPU Type | Bits | Batch Size | vLLM Util | vLLM TP | Peak Mem | Peak VRAM | Throughput | Sec per step | Actor MFU |
| ---- | ------------- | ---- | ---------- | --------- | ------- | -------- | --------- | ---------- | ------------ | --------- |
| 3B | 8 * H100 80GB | AMP | 4 / 16 | 0.6 | 2 | 120GB | 35GB | 1200 | 180s | 6.3% |
| 7B | 8 * H100 80GB | AMP | 4 / 16 | 0.6 | 2 | 140GB | 60GB | 1200 | 180s | 13.6% |
| 7B | 8 * H100 80GB | AMP | 10 / 20 | 0.6 | 2 | 150GB | 75GB | 1400 | 170s | 19.2% |
| 7B | 8 * L20 48GB | AMP | 4 / 16 | 0.6 | 2 | 150GB | 44GB | 410 | 580s | 26.5% |
| 7B | 8 * H100 80GB | BF16 | 4 / 16 | 0.6 | 2 | 150GB | 50GB | 1280 | 190s | 13.9% |
| 32B | 8 * H100 80GB | BF16 | 1 / 8 | 0.6 | 8 | 240GB | 68GB | 360 | 860s | 11.2% |
- Batch Size: micro_batch_size_per_device_for_update / micro_batch_size_per_device_for_experience
- vLLM Util: rollout.gpu_memory_utilization
- vLLM TP: rollout.tensor_parallel_size
- Peak Mem: Peak CPU memory usage
- Peak VRAM: Peak GPU memory usage
- Throughput: Number of tokens per second per GPU by one training step
- Sec per step: Average time per step in seconds
> [!NOTE]
> The hyper-parameters not listed are all the same as the default values.
assets/wechat.jpg

113 KB | W: | H:

assets/wechat.jpg

111 KB | W: | H:

assets/wechat.jpg
assets/wechat.jpg
assets/wechat.jpg
assets/wechat.jpg
  • 2-up
  • Swipe
  • Onion skin
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10 FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250705
\ No newline at end of file \ No newline at end of file
#!/bin/bash
set -x set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-3B-Instruct # replace it with your local file path MODEL_PATH=Qwen/Qwen2.5-VL-3B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \ python3 -m verl.trainer.main \
...@@ -13,7 +9,6 @@ python3 -m verl.trainer.main \ ...@@ -13,7 +9,6 @@ python3 -m verl.trainer.main \
data.format_prompt=./examples/format_prompt/r1v_format.jinja \ data.format_prompt=./examples/format_prompt/r1v_format.jinja \
worker.actor.model.model_path=${MODEL_PATH} \ worker.actor.model.model_path=${MODEL_PATH} \
worker.rollout.tensor_parallel_size=1 \ worker.rollout.tensor_parallel_size=1 \
worker.reward.reward_type=sequential \ worker.reward.score_function=./examples/score_function/r1v.py:compute_score \
worker.reward.reward_function=./examples/reward_function/r1v.py:compute_score \
trainer.experiment_name=qwen2_5_vl_3b_clevr \ trainer.experiment_name=qwen2_5_vl_3b_clevr \
trainer.n_gpus_per_node=2 trainer.n_gpus_per_node=2
#!/bin/bash
set -x set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-3B-Instruct # replace it with your local file path MODEL_PATH=Qwen/Qwen2.5-VL-3B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \ python3 -m verl.trainer.main \
...@@ -13,7 +9,6 @@ python3 -m verl.trainer.main \ ...@@ -13,7 +9,6 @@ python3 -m verl.trainer.main \
data.format_prompt=./examples/format_prompt/r1v_format.jinja \ data.format_prompt=./examples/format_prompt/r1v_format.jinja \
worker.actor.model.model_path=${MODEL_PATH} \ worker.actor.model.model_path=${MODEL_PATH} \
worker.rollout.tensor_parallel_size=1 \ worker.rollout.tensor_parallel_size=1 \
worker.reward.reward_type=sequential \ worker.reward.score_function=./examples/score_function/r1v.py:compute_score \
worker.reward.reward_function=./examples/reward_function/r1v.py:compute_score \
trainer.experiment_name=qwen2_5_vl_3b_geoqa8k \ trainer.experiment_name=qwen2_5_vl_3b_geoqa8k \
trainer.n_gpus_per_node=8 trainer.n_gpus_per_node=8
...@@ -7,9 +7,8 @@ data: ...@@ -7,9 +7,8 @@ data:
max_prompt_length: 2048 max_prompt_length: 2048
max_response_length: 2048 max_response_length: 2048
rollout_batch_size: 512 rollout_batch_size: 512
val_batch_size: 1024 val_batch_size: -1
format_prompt: ./examples/format_prompt/math_format.jinja format_prompt: ./examples/format_prompt/math_format.jinja
override_chat_template: null
shuffle: true shuffle: true
seed: 1 seed: 1
max_pixels: 4194304 max_pixels: 4194304
...@@ -71,17 +70,16 @@ worker: ...@@ -71,17 +70,16 @@ worker:
offload_params: false offload_params: false
reward: reward:
reward_type: batch reward_type: function
reward_function: ./examples/reward_function/math.py:compute_score score_function: ./examples/score_function/math.py:compute_score
trainer: trainer:
total_epochs: 15 total_episodes: 15
max_steps: null logger: ["console", "wandb"]
project_name: easy_r1 project_name: easy_r1
experiment_name: qwen2_5_7b_math_grpo experiment_name: qwen2_5_7b_math_grpo
logger: ["console", "wandb"]
nnodes: 1
n_gpus_per_node: 8 n_gpus_per_node: 8
nnodes: 1
val_freq: 5 # -1 to disable val_freq: 5 # -1 to disable
val_before_train: true val_before_train: true
val_only: false val_only: false
......
#!/bin/bash
set -x set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-7B-Instruct # replace it with your local file path MODEL_PATH=Qwen/Qwen2.5-7B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \ python3 -m verl.trainer.main \
config=examples/config.yaml \ config=examples/config.yaml \
worker.actor.model.model_path=${MODEL_PATH} data.train_files=hiyouga/math12k@train \
data.val_files=hiyouga/math12k@test \
worker.actor.model.model_path=${MODEL_PATH} \
trainer.experiment_name=qwen2_5_7b_math_grpo \
trainer.n_gpus_per_node=8
#!/bin/bash
set -x set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-32B-Instruct # replace it with your local file path MODEL_PATH=Qwen/Qwen2.5-VL-32B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \ python3 -m verl.trainer.main \
......
#!/bin/bash
set -x set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-3B-Instruct # replace it with your local file path MODEL_PATH=Qwen/Qwen2.5-VL-3B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \ python3 -m verl.trainer.main \
......
#!/bin/bash
set -x set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct # replace it with your local file path MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \ python3 -m verl.trainer.main \
......
#!/bin/bash
set -x set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct # replace it with your local file path MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \ python3 -m verl.trainer.main \
......
#!/bin/bash
set -x set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct # replace it with your local file path MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \ python3 -m verl.trainer.main \
......
#!/bin/bash
# REMINDER: this script uses test data split and should ONLY be used for debugging. DO NOT use for training. # REMINDER: this script uses test data split and should ONLY be used for debugging. DO NOT use for training.
set -x set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct # replace it with your local file path MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct # replace it with your local file path
python3 -m verl.trainer.main \ python3 -m verl.trainer.main \
......
#!/bin/bash
set -x
export PYTHONUNBUFFERED=1
MODEL_PATH=Qwen/Qwen3-4B # replace it with your local file path
python3 -m verl.trainer.main \
config=examples/config.yaml \
data.max_response_length=4096 \
worker.actor.model.model_path=${MODEL_PATH} \
trainer.experiment_name=qwen3_4b_math_grpo
working_dir: ./ working_dir: ./
excludes: ["/.git/"] excludes: ["/.git/"]
env_vars: env_vars:
TOKENIZERS_PARALLELISM: "true" TOKENIZERS_PARALLELISM: true
NCCL_DEBUG: "WARN" NCCL_DEBUG: "WARN"
VLLM_LOGGING_LEVEL: "WARN" VLLM_LOGGING_LEVEL: "INFO"
TORCH_NCCL_AVOID_RECORD_STREAMS: "1" TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False" PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
PYTHONUNBUFFERED: "1"
...@@ -13,34 +13,28 @@ ...@@ -13,34 +13,28 @@
# limitations under the License. # limitations under the License.
import re import re
from typing import Dict, List from typing import Dict
from mathruler.grader import extract_boxed_content, grade_answer from mathruler.grader import extract_boxed_content, grade_answer
def format_reward(predict: str) -> float: def format_reward(predict_str: str) -> float:
pattern = re.compile(r"<think>.*</think>.*\\boxed\{.*\}.*", re.DOTALL) pattern = re.compile(r"<think>.*</think>.*\\boxed\{.*\}.*", re.DOTALL)
format_match = re.fullmatch(pattern, predict) format_match = re.fullmatch(pattern, predict_str)
return 1.0 if format_match else 0.0 return 1.0 if format_match else 0.0
def accuracy_reward(predict: str, ground_truth: str) -> float: def accuracy_reward(predict_str: str, ground_truth: str) -> float:
answer = extract_boxed_content(predict) answer = extract_boxed_content(predict_str)
return 1.0 if grade_answer(answer, ground_truth) else 0.0 return 1.0 if grade_answer(answer, ground_truth) else 0.0
def compute_score(predicts: List[str], ground_truths: List[str], format_weight: float = 0.1) -> List[Dict[str, float]]: def compute_score(predict_str: str, ground_truth: str, format_weight: float = 0.1) -> Dict[str, float]:
scores = [] predict_str = re.sub(r"\s*(<|>|/)\s*", r"\1", predict_str) # handle qwen2.5vl-32b format
for predict, ground_truth in zip(predicts, ground_truths): format_score = format_reward(predict_str)
predict = re.sub(r"\s*(<|>|/)\s*", r"\1", predict) # handle qwen2.5vl-32b format accuracy_score = accuracy_reward(predict_str, ground_truth)
format_score = format_reward(predict) return {
accuracy_score = accuracy_reward(predict, ground_truth) "overall": (1 - format_weight) * accuracy_score + format_weight * format_score,
scores.append( "format": format_score,
{ "accuracy": accuracy_score,
"overall": (1 - format_weight) * accuracy_score + format_weight * format_score, }
"format": format_score,
"accuracy": accuracy_score,
}
)
return scores
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment