# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
- Preprocess data and split the training set into 75% for training the RM and 25% for validating the RM.
- All the training data is used to train SFT and RL.
- Both chosen and rejected responses are used to train SFT.
"""
import argparse
import os
import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm
from verl.utils.fs import copy, makedirs
def generate_sft_dataset(target_hdfs_path_dir, local_dir="~/data/full_hh_rlhf/sft"):
dataset = load_dataset("Dahoas/full-hh-rlhf")
output = {"prompt": [], "response": []}
for data in tqdm(dataset["train"]):
# add chosen
output["prompt"].append(data["prompt"])
output["response"].append(data["chosen"])
# add rejection
output["prompt"].append(data["prompt"])
output["response"].append(data["rejected"])
df = pd.DataFrame(output)
local_dir = os.path.expanduser(local_dir)
os.makedirs(local_dir, exist_ok=True)
local_path = os.path.join(local_dir, "train.parquet")
df.to_parquet(path=local_path)
if target_hdfs_path_dir is not None:
hdfs_dir = target_hdfs_path_dir + "/" + "train.parquet"
makedirs(hdfs_dir)
copy(local_path, hdfs_dir)
def generate_rm_dataset(target_hdfs_path_dir, local_dir="~/data/full_hh_rlhf/rm"):
train_dataset = load_dataset("Dahoas/full-hh-rlhf", split="train[:75%]")
test_dataset = load_dataset("Dahoas/full-hh-rlhf", split="train[-25%:]")
local_dir = os.path.expanduser(local_dir)
os.makedirs(local_dir, exist_ok=True)
for dataset, name in zip([train_dataset, test_dataset], ["train", "test"], strict=True):
output = {"prompt": [], "chosen": [], "rejected": []}
for data in tqdm(dataset):
# add chosen
output["prompt"].append(data["prompt"])
output["chosen"].append(data["chosen"])
output["rejected"].append(data["rejected"])
df = pd.DataFrame(output)
local_path = os.path.join(local_dir, name + ".parquet")
df.to_parquet(path=local_path)
if target_hdfs_path_dir is not None:
hdfs_dir = target_hdfs_path_dir + "/" + name + ".parquet"
makedirs(hdfs_dir)
copy(local_path, hdfs_dir)
def generate_rl_dataset(target_hdfs_path_dir, local_dir="~/data/full_hh_rlhf/rl"):
dataset = load_dataset("Dahoas/full-hh-rlhf")
train_dataset = dataset["train"]
data_source = "Dahoas/full-hh-rlhf"
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
prompt = example.pop("prompt")
response = example.pop("response")
data = {
"data_source": data_source,
"prompt": [{"role": "user", "content": prompt}],
"ability": "alignment",
"reward_model": {
"style": "model",
"ground_truth": response, # should not be used
},
"extra_info": {"split": split, "index": idx},
}
return data
return process_fn
train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    local_dir = os.path.expanduser(local_dir)
    os.makedirs(local_dir, exist_ok=True)
    local_path = os.path.join(local_dir, "train.parquet")
train_dataset.to_parquet(local_path)
if target_hdfs_path_dir is not None:
hdfs_dir = target_hdfs_path_dir + "/" + "train.parquet"
makedirs(hdfs_dir)
copy(local_path, hdfs_dir)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--split", type=str, choices=["sft", "rm", "rl"], required=True)
parser.add_argument("--local_dir", type=str, default="~/data/full_hh_rlhf")
parser.add_argument("--hdfs_dir", type=str, required=False, default=None)
args = parser.parse_args()
if args.split == "sft":
generate_sft_dataset(args.hdfs_dir, os.path.join(args.local_dir, args.split))
elif args.split == "rm":
generate_rm_dataset(args.hdfs_dir, os.path.join(args.local_dir, args.split))
elif args.split == "rl":
generate_rl_dataset(args.hdfs_dir, os.path.join(args.local_dir, args.split))
else:
raise NotImplementedError
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the Geometry3k dataset to parquet format
"""
import argparse
import os
import datasets
from verl.utils.hdfs_io import copy, makedirs
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--local_dir", default="~/data/geo3k")
parser.add_argument("--hdfs_dir", default=None)
args = parser.parse_args()
data_source = "hiyouga/geometry3k"
dataset = datasets.load_dataset(data_source)
train_dataset = dataset["train"]
test_dataset = dataset["test"]
instruction_following = (
r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. "
r"The reasoning process MUST BE enclosed within <think> </think> tags. "
r"The final answer MUST BE put in \boxed{}."
)
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
problem = example.pop("problem")
prompt = problem + " " + instruction_following
answer = example.pop("answer")
images = example.pop("images")
data = {
"data_source": data_source,
"prompt": [
{
"role": "user",
"content": prompt,
}
],
"images": images,
"ability": "math",
"reward_model": {"style": "rule", "ground_truth": answer},
"extra_info": {
"split": split,
"index": idx,
"answer": answer,
"question": problem,
},
}
return data
return process_fn
train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True, num_proc=8)
test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True, num_proc=8)
    local_dir = os.path.expanduser(args.local_dir)
    hdfs_dir = args.hdfs_dir
    os.makedirs(local_dir, exist_ok=True)
train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
if hdfs_dir is not None:
makedirs(hdfs_dir)
copy(src=local_dir, dst=hdfs_dir)
# Copyright 2023-2025 SGLang Team
# Copyright Amazon.com, Inc. or its affiliates.
# Copyright 2025 Reallm Labs Ltd. or its affiliates
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the Geometry3k dataset to parquet format
"""
import argparse
import os
import datasets
from verl.utils.hdfs_io import copy, makedirs
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--local_dir", default="~/data/geo3k_multiturn_w_tool")
parser.add_argument("--hdfs_dir", default=None)
args = parser.parse_args()
data_source = "hiyouga/geometry3k"
dataset = datasets.load_dataset(data_source)
train_dataset = dataset["train"]
test_dataset = dataset["test"]
instruction_following = (
r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. "
r"The reasoning process MUST BE enclosed within <think> </think> tags. "
r"The final answer MUST BE put in \boxed{}."
)
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
problem = example.pop("problem")
prompt = problem + " " + instruction_following
answer = example.pop("answer")
images = example.pop("images")
data = {
"data_source": data_source,
"prompt": [
{
"role": "system",
"content": (
"You are a math expert. You are given a question and you need to solve it step by step. "
"Reasoning step by step before any tool call. "
"You should use the `calc_geo3k_reward` tool after step by step solving the question, "
"before generate final answer at least once and refine your answer if necessary. "
),
},
{
"role": "user",
"content": prompt,
},
],
"images": images,
"ability": "math",
"reward_model": {"style": "rule", "ground_truth": answer},
"extra_info": {
"split": split,
"index": idx,
"answer": answer,
"question": problem,
"need_tools_kwargs": True,
"tools_kwargs": {
"calc_geo3k_reward": {
"create_kwargs": {"ground_truth": answer},
# "execute_kwargs": {},
# "calc_reward_kwargs": {},
# "release_kwargs": {},
},
},
},
}
return data
return process_fn
train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True, num_proc=8)
test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True, num_proc=8)
    local_dir = os.path.expanduser(args.local_dir)
    hdfs_dir = args.hdfs_dir
    os.makedirs(local_dir, exist_ok=True)
train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
if hdfs_dir is not None:
makedirs(hdfs_dir)
copy(src=local_dir, dst=hdfs_dir)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""
import argparse
import os
import re
import datasets
from verl.utils.hdfs_io import copy, makedirs
def extract_solution(solution_str):
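    # GSM8K reference answers end with a line like "#### 72"; capture the number after
    # "####" and strip thousands separators, e.g. "#### 1,250" -> "1250".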
solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
assert solution is not None
final_solution = solution.group(0)
final_solution = final_solution.split("#### ")[1].replace(",", "")
return final_solution
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--local_dir", default="~/data/gsm8k")
parser.add_argument("--hdfs_dir", default=None)
args = parser.parse_args()
data_source = "openai/gsm8k"
dataset = datasets.load_dataset(data_source, "main")
train_dataset = dataset["train"]
test_dataset = dataset["test"]
instruction_following = 'Let\'s think step by step and output the final answer after "####".'
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
question_raw = example.pop("question")
question = question_raw + " " + instruction_following
answer_raw = example.pop("answer")
solution = extract_solution(answer_raw)
data = {
"data_source": data_source,
"prompt": [
{
"role": "user",
"content": question,
}
],
"ability": "math",
"reward_model": {"style": "rule", "ground_truth": solution},
"extra_info": {
"split": split,
"index": idx,
"answer": answer_raw,
"question": question_raw,
},
}
return data
return process_fn
train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
    local_dir = os.path.expanduser(args.local_dir)
    hdfs_dir = args.hdfs_dir
    os.makedirs(local_dir, exist_ok=True)
train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
if hdfs_dir is not None:
makedirs(hdfs_dir)
copy(src=local_dir, dst=hdfs_dir)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""
import argparse
import os
import re
import datasets
from verl.utils.hdfs_io import copy, makedirs
def extract_solution(solution_str):
solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
assert solution is not None
final_solution = solution.group(0)
final_solution = final_solution.split("#### ")[1].replace(",", "")
return final_solution
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--local_dir", default="~/data/gsm8k")
parser.add_argument("--hdfs_dir", default=None)
args = parser.parse_args()
data_source = "openai/gsm8k"
dataset = datasets.load_dataset(data_source, "main")
train_dataset = dataset["train"]
test_dataset = dataset["test"]
instruction_following = "Let's think step by step and output the final answer after `####`."
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
question_raw = example.pop("question")
question = question_raw + " " + instruction_following
answer_raw = example.pop("answer")
solution = extract_solution(answer_raw)
data = {
"data_source": data_source,
"prompt": [
{
"role": "system",
"content": (
"You are a math expert. You are given a question and you need to solve it step by step. "
"You should rethinking carefully if user point out your answer is wrong. "
"Put your final answer in the format of `#### <answer>`."
),
},
{
"role": "user",
"content": question,
},
],
"ability": "math",
"reward_model": {"style": "rule", "ground_truth": solution},
"extra_info": {
"split": split,
"index": idx,
"answer": answer_raw,
"question": question_raw,
"interaction_kwargs": {
"name": "gsm8k",
"query": question,
"ground_truth": solution,
},
},
}
return data
return process_fn
train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
    local_dir = os.path.expanduser(args.local_dir)
    hdfs_dir = args.hdfs_dir
    os.makedirs(local_dir, exist_ok=True)
train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
if hdfs_dir is not None:
makedirs(hdfs_dir)
copy(src=local_dir, dst=hdfs_dir)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""
import argparse
import os
import re
import datasets
from verl.utils.hdfs_io import copy, makedirs
def extract_solution(solution_str):
solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
assert solution is not None
final_solution = solution.group(0)
final_solution = final_solution.split("#### ")[1].replace(",", "")
return final_solution
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--local_dir", default="~/data/gsm8k")
parser.add_argument("--hdfs_dir", default=None)
args = parser.parse_args()
data_source = "openai/gsm8k"
dataset = datasets.load_dataset(data_source, "main")
train_dataset = dataset["train"]
test_dataset = dataset["test"]
instruction_following = "Let's think step by step and output the final answer after `####`."
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
question_raw = example.pop("question")
question = question_raw + " " + instruction_following
answer_raw = example.pop("answer")
solution = extract_solution(answer_raw)
data = {
"data_source": data_source,
"prompt": [
{
"role": "system",
"content": (
"You are a math expert. You are given a question and you need to solve it step by step. "
"Reasoning step by step before any tool call. "
"You should use the `calc_gsm8k_reward` tool after step by step solving the question, "
"before generate final answer at least once and refine your answer if necessary. "
"Put your final answer in the format of `#### <answer>`."
),
},
{
"role": "user",
"content": question,
},
],
"ability": "math",
"reward_model": {"style": "rule", "ground_truth": solution},
"extra_info": {
"split": split,
"index": idx,
"answer": answer_raw,
"question": question_raw,
"need_tools_kwargs": True,
"tools_kwargs": {
"calc_gsm8k_reward": {
"create_kwargs": {"ground_truth": solution},
# "execute_kwargs": {},
# "calc_reward_kwargs": {},
# "release_kwargs": {},
},
},
"interaction_kwargs": {
"query": question,
"ground_truth": solution,
},
},
}
return data
return process_fn
train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
    local_dir = os.path.expanduser(args.local_dir)
    hdfs_dir = args.hdfs_dir
    os.makedirs(local_dir, exist_ok=True)
train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
if hdfs_dir is not None:
makedirs(hdfs_dir)
copy(src=local_dir, dst=hdfs_dir)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""
import argparse
import os
import re
import datasets
from verl.utils.hdfs_io import copy, makedirs
def extract_solution(solution_str):
solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
assert solution is not None
final_solution = solution.group(0)
final_solution = final_solution.split("#### ")[1].replace(",", "")
return final_solution
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--local_dir", default="~/data/gsm8k")
parser.add_argument("--hdfs_dir", default=None)
args = parser.parse_args()
data_source = "openai/gsm8k"
dataset = datasets.load_dataset(data_source, "main")
train_dataset = dataset["train"]
test_dataset = dataset["test"]
instruction_following = "Let's think step by step and output the final answer after `####`."
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
question_raw = example.pop("question")
question = question_raw + " " + instruction_following
answer_raw = example.pop("answer")
solution = extract_solution(answer_raw)
data = {
"data_source": data_source,
"agent_name": "tool_agent",
"prompt": [
{
"role": "system",
"content": (
"You are a math expert. You are given a question and you need to solve it step by step. "
"Reasoning step by step before any tool call. "
"You should use the `calc_gsm8k_reward` tool after step by step solving the question, "
"before generate final answer at least once and refine your answer if necessary. "
"Put your final answer in the format of `#### <answer>`."
),
},
{
"role": "user",
"content": question,
},
],
"ability": "math",
"reward_model": {"style": "rule", "ground_truth": solution},
"extra_info": {
"split": split,
"index": idx,
"answer": answer_raw,
"question": question_raw,
"need_tools_kwargs": True,
"tools_kwargs": {
"calc_gsm8k_reward": {
"create_kwargs": {"ground_truth": solution},
# "execute_kwargs": {},
# "calc_reward_kwargs": {},
# "release_kwargs": {},
},
},
"interaction_kwargs": {
"query": question,
"ground_truth": solution,
},
},
}
return data
return process_fn
train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
    local_dir = os.path.expanduser(args.local_dir)
    hdfs_dir = args.hdfs_dir
    os.makedirs(local_dir, exist_ok=True)
train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
if hdfs_dir is not None:
makedirs(hdfs_dir)
copy(src=local_dir, dst=hdfs_dir)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess Hellaswag dataset.
"""
import argparse
import os
import re
import datasets
from verl.utils.hdfs_io import copy, makedirs
def preprocess(text):
text = text.strip()
# NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
text = text.replace(" [title]", ". ")
text = re.sub("\\[.*?\\]", "", text)
    text = text.replace("  ", " ")
return text
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--local_dir", default="/opt/tiger/hellaswag")
parser.add_argument("--hdfs_dir", default=None)
args = parser.parse_args()
data_source = "Rowan/hellaswag"
dataset = datasets.load_dataset(data_source, trust_remote_code=True)
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]
instruction = "Please complete the following sentence.\n"
def make_map_fn(split):
def process_fn(doc, idx):
ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
query = preprocess(doc["activity_label"] + ": " + ctx)
choices = [preprocess(ending) for ending in doc["endings"]]
gold = int(doc["label"])
data = {
"data_source": data_source,
"prompt": [{"role": "user", "content": query}],
"ability": "nlp",
"reward_model": {
"style": "model",
"eval": "multiple_choice", # using loglikelihood
"ground_truth": gold,
"choices": choices,
},
"extra_info": {"split": split, "index": idx},
}
return data
return process_fn
# filter data that doesn't have a label
train_dataset = train_dataset.filter(lambda x: len(x["label"]) > 0)
val_dataset = val_dataset.filter(lambda x: len(x["label"]) > 0)
test_dataset = test_dataset.filter(lambda x: len(x["label"]) > 0)
train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
val_dataset = val_dataset.map(function=make_map_fn("validation"), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
    local_dir = os.path.expanduser(args.local_dir)
    hdfs_dir = args.hdfs_dir
    os.makedirs(local_dir, exist_ok=True)
train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
val_dataset.to_parquet(os.path.join(local_dir, "validation.parquet"))
test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
if hdfs_dir is not None:
makedirs(hdfs_dir)
copy(src=local_dir, dst=hdfs_dir)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the MATH-lighteval dataset to parquet format
"""
import argparse
import os
import datasets
from verl.utils.hdfs_io import copy, makedirs
from verl.utils.reward_score.math import last_boxed_only_string, remove_boxed
def extract_solution(solution_str):
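    # e.g. a solution ending with "... so the answer is \boxed{42}." yields "42"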
return remove_boxed(last_boxed_only_string(solution_str))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--local_dir", default="~/data/math")
parser.add_argument("--hdfs_dir", default=None)
args = parser.parse_args()
# 'lighteval/MATH' is no longer available on huggingface.
# Use mirror repo: DigitalLearningGmbH/MATH-lighteval
data_source = "DigitalLearningGmbH/MATH-lighteval"
print(f"Loading the {data_source} dataset from huggingface...", flush=True)
dataset = datasets.load_dataset(data_source, trust_remote_code=True)
train_dataset = dataset["train"]
test_dataset = dataset["test"]
instruction_following = "Let's think step by step and output the final answer within \\boxed{}."
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
question = example.pop("problem")
question = question + " " + instruction_following
answer = example.pop("solution")
solution = extract_solution(answer)
data = {
"data_source": data_source,
"prompt": [{"role": "user", "content": question}],
"ability": "math",
"reward_model": {"style": "rule", "ground_truth": solution},
"extra_info": {"split": split, "index": idx},
}
return data
return process_fn
train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
    local_dir = os.path.expanduser(args.local_dir)
    hdfs_dir = args.hdfs_dir
    os.makedirs(local_dir, exist_ok=True)
train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
if hdfs_dir is not None:
makedirs(hdfs_dir)
copy(src=local_dir, dst=hdfs_dir)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Create a simple multi-turn dataset for testing
"""
import argparse
import os
import pandas as pd
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--local_dir", default="~/data/multiturn")
parser.add_argument("--hdfs_dir", default=None)
args = parser.parse_args()
# Create example conversations
conversations = []
# Conversation 1
conversations.append(
{
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is the capital of France?"},
{"role": "assistant", "content": "The capital of France is Paris."},
{"role": "user", "content": "And what about Germany?"},
{"role": "assistant", "content": "The capital of Germany is Berlin."},
]
}
)
# Conversation 2
conversations.append(
{
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Can you explain quantum computing?"},
{
"role": "assistant",
"content": "Quantum computing is a type of computing that uses quantum-mechanical phenomena, "
"such as superposition and entanglement, to perform operations on data.",
},
{"role": "user", "content": "How is it different from classical computing?"},
{
"role": "assistant",
"content": "Classical computing uses bits that are either 0 or 1, while quantum computing uses "
"quantum bits or qubits that can exist in multiple states simultaneously due to superposition.",
},
]
}
)
# Conversation 3
conversations.append(
{
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Write a simple Python function to calculate factorial."},
{
"role": "assistant",
"content": (
"```python\ndef factorial(n):\n if n == 0 or n == 1:\n return 1\n else:\n "
"return n * factorial(n-1)\n```\n\nThis is a recursive function to calculate the "
"factorial of a number."
),
},
{"role": "user", "content": "Can you make it iterative instead?"},
{
"role": "assistant",
"content": (
"```python\ndef factorial(n):\n result = 1\n for i in range(1, n+1):\n "
"result *= i\n return result\n```\n\nThis is an iterative version of the factorial function."
),
},
]
}
)
# Create train and test datasets
train_data = conversations[:2] # First 2 conversations for training
test_data = conversations[2:] # Last conversation for testing
# Create output directory
local_dir = os.path.expanduser(args.local_dir)
os.makedirs(local_dir, exist_ok=True)
# Save to parquet files
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)
train_df.to_parquet(os.path.join(local_dir, "train.parquet"))
test_df.to_parquet(os.path.join(local_dir, "test.parquet"))
# Handle HDFS if specified
if args.hdfs_dir is not None:
try:
from verl.utils.hdfs_io import copy, makedirs
makedirs(args.hdfs_dir)
copy(src=local_dir, dst=args.hdfs_dir)
except ImportError:
print("Warning: HDFS support not available. Skipping HDFS copy.")
# Print statistics
print(f"Train dataset size: {len(train_df)}")
print(f"Test dataset size: {len(test_df)}")
print(f"Data saved to {local_dir}")
if __name__ == "__main__":
main()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import tempfile
import pandas as pd
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError
from verl.utils.hdfs_io import copy, makedirs
# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Configuration constants
DEFAULT_SYSTEM_CONTENT = "You are a helpful and harmless assistant."
DEFAULT_USER_CONTENT_PREFIX = (
"Answer the given question. You must conduct reasoning inside <think> and </think> "
"first every time you get new information. After reasoning, if you find you lack "
"some knowledge, you can call a search engine by <tool_call> query </tool_call> "
"and it will return the top searched results between <tool_response> and "
"</tool_response>. You can search as many times as your want. If you find no "
"further external knowledge needed, you can directly provide the answer inside "
"<answer> and </answer>, without detailed illustrations. For example, "
"<answer> Beijing </answer>. Question: "
)
def process_single_row(row, current_split_name, row_index):
"""
Process a single row of data for SearchR1-like format.
Args:
row: DataFrame row containing the original data
current_split_name: Name of the current split (train/test)
row_index: Index of the row in the DataFrame
Returns:
pd.Series: Processed row data in the required format
"""
question = row.get("question", "")
# Build prompt structure
user_content = user_content_prefix.rstrip("\n") + question
prompt = [{"role": "system", "content": system_content}, {"role": "user", "content": user_content}]
# Extract ground truth from reward_model or fallback to golden_answers
reward_model_data = row.get("reward_model")
if isinstance(reward_model_data, dict) and "ground_truth" in reward_model_data:
ground_truth = reward_model_data.get("ground_truth")
else:
ground_truth = row.get("golden_answers", [])
# Process data source
data_source_tagged = "searchR1_" + str(row.get("data_source", ""))
# Build tools kwargs structure
tools_kwargs = {
"search": {
"create_kwargs": {"ground_truth": ground_truth, "question": question, "data_source": data_source_tagged}
}
}
# Build complete extra_info structure
extra_info = {
"index": row_index,
"need_tools_kwargs": True,
"question": question,
"split": current_split_name,
"tools_kwargs": tools_kwargs,
}
return pd.Series(
{
"data_source": data_source_tagged,
"prompt": prompt,
"ability": row.get("ability"),
"reward_model": reward_model_data,
"extra_info": extra_info,
"metadata": row.get("metadata"),
}
)
def main():
local_save_dir = os.path.expanduser(args.local_dir)
os.makedirs(local_save_dir, exist_ok=True)
processed_files = []
# Download and process files using temporary directory
with tempfile.TemporaryDirectory() as tmp_download_dir:
for split in ["train", "test"]:
parquet_filename = f"{split}.parquet"
logger.info(f"Processing {split} split...")
try:
# Download Parquet file from HuggingFace
logger.info(f"Downloading {parquet_filename} from {args.hf_repo_id}")
local_parquet_filepath = hf_hub_download(
repo_id=args.hf_repo_id,
filename=parquet_filename,
repo_type="dataset",
local_dir=tmp_download_dir,
local_dir_use_symlinks=False,
)
# Load and process Parquet file
df_raw = pd.read_parquet(local_parquet_filepath)
logger.info(f"Loaded {len(df_raw)} rows from {parquet_filename}")
def apply_process_row(row, split_name=split):
return process_single_row(row, current_split_name=split_name, row_index=row.name)
df_processed = df_raw.apply(apply_process_row, axis=1)
# Save processed DataFrame
output_file_path = os.path.join(local_save_dir, f"{split}.parquet")
df_processed.to_parquet(output_file_path, index=False)
logger.info(f"Saved {len(df_processed)} processed rows to {output_file_path}")
processed_files.append(output_file_path)
except EntryNotFoundError:
logger.warning(f"{parquet_filename} not found in repository {args.hf_repo_id}")
except Exception as e:
logger.error(f"Error processing {split} split: {e}")
if not processed_files:
logger.warning("No data was processed or saved")
return
logger.info(f"Successfully processed {len(processed_files)} files to {local_save_dir}")
# Copy to HDFS if specified
if args.hdfs_dir:
try:
makedirs(args.hdfs_dir)
copy(src=local_save_dir, dst=args.hdfs_dir)
logger.info(f"Successfully copied files to HDFS: {args.hdfs_dir}")
except Exception as e:
logger.error(f"Error copying files to HDFS: {e}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Download Search-R1 from HuggingFace, process, and save to Parquet.")
parser.add_argument(
"--hf_repo_id", default="PeterJinGo/nq_hotpotqa_train", help="HuggingFace dataset repository ID."
)
parser.add_argument(
"--local_dir",
default="~/data/searchR1_processed_direct",
help="Local directory to save the processed Parquet files.",
)
parser.add_argument("--hdfs_dir", default=None, help="Optional HDFS directory to copy the Parquet files to.")
args = parser.parse_args()
# System and user content configuration
system_content = DEFAULT_SYSTEM_CONTENT
user_content_prefix = DEFAULT_USER_CONTENT_PREFIX
main()
set -x
data_path=$HOME/data/rlhf/gsm8k/test.parquet
save_path=$HOME/data/rlhf/math/deepseek_v2_lite_gen_test.parquet
model_path=deepseek-ai/deepseek-llm-7b-chat
python3 -m verl.trainer.main_generation \
trainer.nnodes=2 \
trainer.n_gpus_per_node=8 \
data.path=$data_path \
data.prompt_key=prompt \
data.n_samples=1 \
data.output_path=$save_path \
    model.path=$model_path \
+model.trust_remote_code=True \
rollout.temperature=1.0 \
rollout.top_k=50 \
rollout.top_p=0.7 \
rollout.prompt_length=2048 \
rollout.response_length=1024 \
rollout.tensor_model_parallel_size=16 \
rollout.gpu_memory_utilization=0.8
set -x
data_path=$HOME/data/gsm8k/test.parquet
save_path=$HOME/data/gsm8k/deepseek_v2_lite_gen_test.parquet
model_path=deepseek-ai/deepseek-llm-7b-chat
python3 -m verl.trainer.main_generation \
trainer.nnodes=1 \
trainer.n_gpus_per_node=8 \
data.path=$data_path \
data.prompt_key=prompt \
data.n_samples=1 \
data.output_path=$save_path \
model.path=$model_path \
+model.trust_remote_code=True \
rollout.temperature=1.0 \
rollout.top_k=50 \
rollout.top_p=0.7 \
rollout.prompt_length=2048 \
rollout.response_length=1024 \
rollout.tensor_model_parallel_size=2 \
rollout.gpu_memory_utilization=0.8
<div align=center>
# Geometric-Mean Policy Optimization
</div>
This is the official implementation of the paper [***Geometric-Mean Policy Optimization***](https://arxiv.org/abs/2507.20673).
<div align=center>
<img width="3092" height="864" alt="image" src="https://github.com/user-attachments/assets/af4c7e0f-923a-45ef-9bcf-57109b8ee61e" />
</div>
## 1. Contents
- Geometric-Mean Policy Optimization
- [1. Contents](#1-contents)
- [2. Introduction](#2-introduction)
  - [3. Code Usage](#3-code-usage)
  - [4. Contacts](#4-contacts)
  - [5. Citation](#5-citation)
## 2. Introduction
Recent advancements, such as Group Relative Policy Optimization (GRPO), have enhanced the reasoning capabilities of large language models by optimizing the arithmetic mean of token-level rewards. However, GRPO suffers from unstable policy updates when processing tokens with outlier importance-weighted rewards, which manifest as extreme importance sampling ratios during training, i.e., the ratio between the sampling probabilities assigned to a token by the current and old policies. In this work, we propose Geometric-Mean Policy Optimization (GMPO), a stabilized variant of GRPO. Instead of optimizing the arithmetic mean, GMPO maximizes the geometric mean of token-level rewards, which is inherently less sensitive to outliers and maintains a more stable range of importance sampling ratios. In addition, we provide comprehensive theoretical and experimental analysis to justify the design and stability benefits of GMPO. Beyond improved stability, GMPO-7B outperforms GRPO by an average of 4.1% on multiple mathematical benchmarks and by 1.4% on multimodal reasoning, across AIME24, AMC, MATH500, OlympiadBench, Minerva, and Geometry3K.
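The core change relative to GRPO can be seen in a small, self-contained sketch (hypothetical function names and simplified shapes; this illustrates the idea only and is not verl's `geo_mean` loss implementation). GRPO averages per-token importance-weighted advantages arithmetically, while GMPO averages the per-token log importance ratios before exponentiating, i.e. it uses their geometric mean, which damps outlier ratios:
```python
import torch


def grpo_style_objective(logp_new, logp_old, adv, eps=0.2):
    """Arithmetic mean over tokens of the clipped importance-weighted advantage (GRPO-style)."""
    ratio = torch.exp(logp_new - logp_old)  # per-token importance ratios, shape [T]
    surrogate = torch.minimum(ratio * adv, torch.clamp(ratio, 1 - eps, 1 + eps) * adv)
    return surrogate.mean()


def gmpo_style_objective(logp_new, logp_old, adv, eps=0.4):
    """Geometric mean: average the log-ratios before exponentiating, damping outlier tokens."""
    log_ratio = logp_new - logp_old  # shape [T]
    geo_ratio = torch.exp(log_ratio.mean())  # equals (prod_t ratio_t) ** (1 / T)
    return torch.minimum(geo_ratio * adv, torch.clamp(geo_ratio, 1 - eps, 1 + eps) * adv)
```
The wider clip range used here (0.4) mirrors the `clip_ratio_low`/`clip_ratio_high` values in the configuration below.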
## 3. Code Usage
The key configurations are:
```
clip_ratio_low=0.4
clip_ratio_high=0.4
loss_mode=geo_mean
```
To get started quickly, run:
```
bash examples/gmpo_trainer/run_qwen2_5-7b_math.sh
```
GMPO can be combined with other methods such as DAPO (experimental - not fully tested):
```
bash examples/gmpo_trainer/test_dapo_7b_math.sh
bash examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh
```
## 4. Contacts
If you have any questions about our work or this repository, please don't hesitate to contact us by email or open an issue in this project.
- [zhaoyuzhong20@mails.ucas.ac.cn](zhaoyuzhong20@mails.ucas.ac.cn)
- [liuyue171@mails.ucas.ac.cn](liuyue171@mails.ucas.ac.cn)
- [lecu@microsoft.com](lecu@microsoft.com)
- [wanfang@ucas.ac.cn](wanfang@ucas.ac.cn)
## 5. Citation
```
@misc{zhao2025geometricmeanpolicyoptimization,
title={Geometric-Mean Policy Optimization},
author={Yuzhong Zhao and Yue Liu and Junpeng Liu and Jingye Chen and Xun Wu and Yaru Hao and Tengchao Lv and Shaohan Huang and Lei Cui and Qixiang Ye and Fang Wan and Furu Wei},
year={2025},
eprint={2507.20673},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2507.20673},
}
```
set -x
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"
use_kl_loss=False
loss_mode=geo_mean
clip_ratio=0.4
save_contents="['model', 'optimizer', 'extra']"
export WANDB_MODE=offline
save_contents="['hf_model']"
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=1024 \
data.max_prompt_length=1024 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=Qwen/Qwen2.5-Math-7B \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio} \
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio} \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.actor.checkpoint.save_contents=${save_contents} \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_gmpo_example_gsm8k_math' \
trainer.experiment_name='qwen2_5_7b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=20 \
trainer.test_freq=5 \
trainer.total_epochs=15 $@
#!/usr/bin/env bash
set -xeuo pipefail
project_name='DAPO'
exp_name='DAPO-Qwen2.5-7b-MATH-0527a1'
adv_estimator=grpo
use_kl_in_reward=False
kl_coef=0.0
use_kl_loss=False
kl_loss_coef=0.0
clip_ratio_low=0.4
clip_ratio_high=0.4
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 8))
enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0
loss_agg_mode="token-mean"
train_prompt_bsz=512
n_resp_per_prompt=16
train_prompt_mini_bsz=32
# Ray
# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-8}
NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
# Paths
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
# Very important: please modify max_position_embeddings in config.json to 32768 after downloading the model from Hugging Face.
MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
# Algorithm
temperature=1.0
top_p=1.0
top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
val_top_p=0.7
# Performance Related Parameter
sp_size=4
use_dynamic_bsz=True
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
offload=True
gen_tp=4
fsdp_size=32
loss_mode=geo_mean
# export WANDB_MODE=offline
save_contents="['model', 'optimizer', 'extra']"
# save_contents="['hf_model']"
# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
python3 -m verl.trainer.main_ppo \
data.train_files="${TRAIN_FILE}" \
data.val_files="${TEST_FILE}" \
data.prompt_key=prompt \
data.truncation='left' \
data.max_prompt_length=${max_prompt_length} \
data.max_response_length=${max_response_length} \
data.train_batch_size=${train_prompt_bsz} \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
algorithm.adv_estimator=${adv_estimator} \
algorithm.use_kl_in_reward=${use_kl_in_reward} \
algorithm.kl_ctrl.kl_coef=${kl_coef} \
actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
actor_rollout_ref.model.use_remove_padding=True \
+actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.model.path="${MODEL_PATH}" \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
actor_rollout_ref.actor.optim.weight_decay=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.grad_clip=1.0 \
actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
actor_rollout_ref.rollout.enable_chunked_prefill=True \
actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
actor_rollout_ref.rollout.temperature=${temperature} \
actor_rollout_ref.rollout.top_p=${top_p} \
actor_rollout_ref.rollout.top_k=${top_k} \
actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
actor_rollout_ref.rollout.val_kwargs.do_sample=True \
actor_rollout_ref.rollout.val_kwargs.n=1 \
actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
actor_rollout_ref.actor.checkpoint.save_contents="${save_contents}" \
reward_model.reward_manager=dapo \
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+reward_model.reward_kwargs.max_resp_len=${max_response_length} \
trainer.logger='["console","wandb"]' \
trainer.project_name="${project_name}" \
trainer.experiment_name="${exp_name}" \
trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
trainer.nnodes="${NNODES}" \
trainer.val_before_train=True \
trainer.test_freq=10 \
trainer.save_freq=10 \
trainer.total_epochs=10 \
trainer.total_training_steps=200 \
trainer.default_local_dir="${CKPTS_DIR}" \
trainer.resume_mode=auto \
trainer.log_val_generations=10
#!/usr/bin/env bash
set -xeuo pipefail
project_name='DAPO'
exp_name='DAPO-Qwen3-30B-A3B-Base-MATH-0527a1'
adv_estimator=grpo
use_kl_in_reward=False
kl_coef=0.0
use_kl_loss=False
kl_loss_coef=0.0
clip_ratio_low=0.4
clip_ratio_high=0.4
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 8))
enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0
loss_agg_mode="token-mean"
train_prompt_bsz=512
n_resp_per_prompt=16
train_prompt_mini_bsz=32
loss_mode=geo_mean
# export WANDB_MODE=offline
save_contents="['model', 'optimizer', 'extra']"
# save_contents="['hf_model']"
# Ray
# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
# WORKING_DIR=${WORKING_DIR:-"${PWD}"}
# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-8}
NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
# Paths
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-30B-A3B-Base"}
CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
# Algorithm
temperature=1.0
top_p=1.0
top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
val_top_p=0.7
# Performance Related Parameter
sp_size=4
use_dynamic_bsz=True
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
offload=True
gen_tp=4
fsdp_size=32
python3 -m verl.trainer.main_ppo \
data.train_files="${TRAIN_FILE}" \
data.val_files="${TEST_FILE}" \
data.prompt_key=prompt \
data.truncation='left' \
data.max_prompt_length=${max_prompt_length} \
data.max_response_length=${max_response_length} \
data.train_batch_size=${train_prompt_bsz} \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
algorithm.adv_estimator=${adv_estimator} \
algorithm.use_kl_in_reward=${use_kl_in_reward} \
algorithm.kl_ctrl.kl_coef=${kl_coef} \
actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
actor_rollout_ref.model.path="${MODEL_PATH}" \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
actor_rollout_ref.actor.optim.weight_decay=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.grad_clip=1.0 \
actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
actor_rollout_ref.rollout.enable_chunked_prefill=True \
actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
actor_rollout_ref.rollout.temperature=${temperature} \
actor_rollout_ref.rollout.top_p=${top_p} \
actor_rollout_ref.rollout.top_k=${top_k} \
actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
actor_rollout_ref.rollout.val_kwargs.do_sample=True \
actor_rollout_ref.rollout.val_kwargs.n=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
actor_rollout_ref.actor.checkpoint.save_contents="${save_contents}" \
reward_model.reward_manager=dapo \
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+reward_model.reward_kwargs.max_resp_len=${max_response_length} \
trainer.logger='["console","wandb"]' \
trainer.project_name="${project_name}" \
trainer.experiment_name="${exp_name}" \
trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
trainer.nnodes="${NNODES}" \
trainer.val_before_train=True \
trainer.test_freq=10 \
trainer.save_freq=10 \
trainer.total_epochs=10 \
trainer.total_training_steps=300 \
trainer.default_local_dir="${CKPTS_DIR}" \
trainer.resume_mode=auto \
trainer.log_val_generations=10
# GPG: Group Policy Gradient
Group Policy Gradient (GPG) is a minimalist reinforcement learning (RL) method that enhances the reasoning ability of large language models without relying on supervised fine-tuning or complex tricks. GPG revisits traditional policy gradients and directly optimizes the RL objective: no surrogate losses, no KL penalties, no critic, and no reference model. Compared to GRPO, GPG is simpler, more efficient, and achieves better results on many tasks. For more details, please refer to the original paper [GPG: A Simple and Strong Reinforcement Learning Baseline for Model Reasoning](https://arxiv.org/abs/2504.02546).
## Key Components
- Use a corrected advantage function to improve policy gradient accuracy and training efficiency.
- Eliminate the critic and reference models and avoid KL divergence constraints, significantly simplifying training compared to Group Relative Policy Optimization (GRPO); a minimal sketch of the resulting loss follows this list.
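Under these assumptions, a minimal, hypothetical sketch of a GPG-style loss looks as follows (illustrative only, with hypothetical names; it is not verl's `gpg` loss_mode nor the paper's exact corrected-advantage formula):
```python
import torch


def gpg_style_loss(seq_logp, rewards, group_size):
    """Plain policy gradient on group-normalized rewards: no critic, no reference model, no KL term, no clipping.

    seq_logp: log-probability of each sampled response under the current policy, shape [B]
    rewards:  scalar reward for each response, shape [B]; responses are grouped per prompt
    """
    grouped = rewards.reshape(-1, group_size)
    adv = (grouped - grouped.mean(dim=1, keepdim=True)) / (grouped.std(dim=1, keepdim=True) + 1e-8)
    return -(adv.reshape(-1) * seq_logp).mean()
```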
## Configuration
To configure GPG within the framework, use the following YAML settings.
```yaml
algorithm:
adv_estimator: gpg
actor_rollout_ref:
actor:
policy_loss:
loss_mode: "gpg"
```
## Advanced Extensions
GPG is a simple and strong baseline for model reasoning. Although it avoids using KL loss in its original form, you can still use KL loss to further improve the performance.
```yaml
algorithm:
adv_estimator: gpg
actor_rollout_ref:
actor:
use_kl_loss: True # enable kl regularization
kl_loss_coef: 0.01
policy_loss:
loss_mode: "gpg"
```
set -x
# If you are using vllm<=0.6.3, you might need to set the following environment variable to avoid bugs:
# export VLLM_ATTENTION_BACKEND=XFORMERS
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=gpg \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=1024 \
data.max_prompt_length=1024 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.actor.policy_loss.loss_mode=gpg \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_gpg_example_gsm8k_math' \
trainer.experiment_name='qwen2_7b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=20 \
trainer.test_freq=5 \
trainer.total_epochs=15 $@
set -x
# If you are using vllm<=0.6.3, you might need to set the following environment variable to avoid bugs:
# export VLLM_ATTENTION_BACKEND=XFORMERS
export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
algorithm.adv_estimator=gpg \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=1024 \
data.max_prompt_length=1024 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
actor_rollout_ref.actor.policy_loss.loss_mode=gpg \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_gpg_example_gsm8k_math' \
trainer.experiment_name='qwen2_7b_megatron' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=20 \
trainer.test_freq=5 \
trainer.total_epochs=15 $@