## Setup
1. **Create a Conda Environment**
Use the following command to create and activate a new environment for the DPO training:
```bash
conda create -n dpo_env python=3.10
conda activate dpo_env
```
2. **Install Dependencies**
After activating the environment, install all required dependencies by running:
```bash
pip install -r requirements.txt
```
3. **Constructing DPO Data**
Provide the data for DPO training as follows:
a JSONL file in which each line is a JSON object with `prompt`, `chosen`, and `rejected` fields, e.g.:
```json
{"prompt": "Prompt", "chosen": "The chosen response", "rejected": "The rejected response"}
```
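For reference, a minimal sketch of producing such a file (the prompt/response strings are placeholders; the output path matches the training step below):
```python
import json

# Hypothetical records; each line of the jsonl file carries exactly these three keys.
preference_pairs = [
    {
        "prompt": "Write a Python function that reverses a string.",
        "chosen": "def reverse(s):\n    return s[::-1]",
        "rejected": "def reverse(s):\n    return s",
    },
]

# One JSON object per line, matching the format expected by the DPO trainer.
with open("/path/to/preference/data", "w", encoding="utf-8") as f:
    for pair in preference_pairs:
        f.write(json.dumps(pair, ensure_ascii=False) + "\n")
```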
4. **Training**
Once the environment is ready and the model paths are configured, run the DPO training by executing the following script (it expects the experiment name, data path, SFT model path, and output directory as positional arguments):
```bash
EXP_NAME="dpo_run"
DATA_PATH="/path/to/preference/data"
SFT_MODEL="/path/to/sft/model"
OUTPUT_DIR="/path/to/output"
bash ./scripts/dpo_qwencoder.sh ${EXP_NAME} ${DATA_PATH} ${SFT_MODEL} ${OUTPUT_DIR}
```
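For context, `BETA=0.1` in `scripts/dpo_qwencoder.sh` is the temperature of the standard DPO objective that `trl`'s `DPOTrainer` optimizes, with the frozen SFT model (`model_ref` in `train.py`) acting as the reference policy:
```math
\mathcal{L}_{\mathrm{DPO}}(\pi_\theta;\pi_{\mathrm{ref}}) =
-\,\mathbb{E}_{(x,\,y_w,\,y_l)\sim\mathcal{D}}\!\left[\log\sigma\!\left(
\beta\log\frac{\pi_\theta(y_w\mid x)}{\pi_{\mathrm{ref}}(y_w\mid x)}
-\beta\log\frac{\pi_\theta(y_l\mid x)}{\pi_{\mathrm{ref}}(y_l\mid x)}\right)\right]
```
where $y_w$ is the chosen and $y_l$ the rejected response for prompt $x$.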
{
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu" :"auto",
"gradient_accumulation_steps": "auto",
"gradient_clipping": 1.0,
"bf16": {
"enabled": "auto"
},
"zero_optimization": {
"stage": 1
}
}
{
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu" :"auto",
"gradient_accumulation_steps": "auto",
"gradient_clipping": 1.0,
"bf16": {
"enabled": "auto"
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true
}
}
{
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu" :"auto",
"gradient_accumulation_steps": "auto",
"gradient_clipping": 1.0,
"bf16": {
"enabled": "auto"
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
}
}
accelerate==1.0.1
datasets==2.18.0
deepspeed==0.12.6+c00388a2
torch==2.2.0a0+81ea7a4
tqdm==4.66.1
transformers==4.44.2
trl==0.11.4
#!/bin/bash
EXP_NAME=${1}
DATA_PATH=${2}
SFT_MODEL=${3}
OUTPUT_DIR=${4}
LOGFILE_PATH="logs/${EXP_NAME}.log"
mkdir -p logs
echo "Output Path" ${OUTPUT_DIR}
echo "Training Data" ${DATA_PATH}
echo "SFT Model" ${SFT_MODEL}
BETA=0.1
LR=5e-6
WARMUP_RATIO=0.1
deepspeed train.py \
--deepspeed "./configs/ds_config_zero3.json" \
--model_name_or_path ${SFT_MODEL} \
--dataset_name ${DATA_PATH} \
--output_dir ${OUTPUT_DIR} \
--save_strategy "epoch" \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 2 \
--evaluation_strategy "steps" \
--eval_steps 100 \
--num_train_epochs 1 \
--gradient_accumulation_steps 2 \
--gradient_checkpointing True \
--learning_rate ${LR} \
--beta ${BETA} \
--warmup_ratio ${WARMUP_RATIO} \
--logging_steps 1 \
--max_length 2048 \
--bf16 True \
--tf32 True >> ${LOGFILE_PATH} 2>&1
import os
os.environ["WANDB_MODE"] = "offline"
import multiprocessing
import torch
from datasets import load_dataset
from tqdm import tqdm
import transformers
import trl
from trl.trainer import DPOConfig,DPOTrainer,ModelConfig
from trl.commands.cli_utils import DPOScriptArguments, TrlParser
from transformers import set_seed
from utils import init_logger
import json
from datasets import disable_caching
disable_caching()
set_seed(1234)
tqdm.pandas()
def train():
parser = TrlParser((DPOScriptArguments, DPOConfig, ModelConfig))
args, training_args, model_config = parser.parse_args_and_config()
logger = init_logger(
os.path.join(training_args.output_dir, 'train.log'),
training_args.local_rank
)
logger.info(f'model args: {model_config}')
logger.info(f'args: {args}')
logger.info(f'training args: {training_args}')
model = transformers.AutoModelForCausalLM.from_pretrained(
model_config.model_name_or_path,
trust_remote_code=True,
torch_dtype=torch.bfloat16
)
model_ref = transformers.AutoModelForCausalLM.from_pretrained(
model_config.model_name_or_path,
trust_remote_code=True,
torch_dtype=torch.bfloat16
)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_config.model_name_or_path, use_fast=False, trust_remote_code=True, model_max_length=training_args.max_length)
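# The Qwen tokenizer ships without a BOS token; map it to the EOS token so that
# downstream code expecting a valid bos_token_id does not receive None.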
tokenizer.add_special_tokens({"bos_token": tokenizer.eos_token})
tokenizer.bos_token_id = tokenizer.eos_token_id
train_dataset = load_dataset('json', data_files=args.dataset_name)
train_test_split = train_dataset['train'].train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']
def process(row):
messages = [
{"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
{"role": "user", "content": row["prompt"]}
]
example = {
'prompt': tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True),
'chosen': row["chosen"],
'rejected': row["rejected"],
}
return example
train_dataset = train_dataset.map(
process,
num_proc=multiprocessing.cpu_count(),
load_from_cache_file=False,
)
test_dataset = test_dataset.map(
process,
num_proc=multiprocessing.cpu_count(),
load_from_cache_file=False,
)
trainer = DPOTrainer(
model,
model_ref,
tokenizer=tokenizer,
args=training_args,
train_dataset=train_dataset,
eval_dataset=test_dataset,
)
trainer.train()
trainer.save_state()
trainer.save_model(output_dir=training_args.output_dir)
if __name__ == "__main__":
train()
import os
import logging
import transformers
from transformers.trainer_callback import TrainerControl, TrainerState
from transformers.training_args import TrainingArguments
def init_logger(fpath='', local_rank=0):
if transformers.trainer_utils.is_main_process(local_rank):
if fpath:
if os.path.dirname(fpath):
os.makedirs(os.path.dirname(fpath), exist_ok=True)
file_handler = logging.FileHandler(fpath, mode='a') # to file
transformers.logging.add_handler(file_handler)
transformers.logging.set_verbosity_info()
else:
transformers.logging.set_verbosity_error() # reduce
transformers.logging.enable_explicit_format()
return transformers.logging.get_logger()
def add_custom_callback(trainer, logger):
if 'PrinterCallback' in trainer.callback_handler.callback_list:
trainer.pop_callback(transformers.PrinterCallback)
trainer.add_callback(LogCallback(logger))
logger.info('Add custom LogCallback')
logger.info(f"trainer's callbacks: {trainer.callback_handler.callback_list}")
class LogCallback(transformers.TrainerCallback):
"""
A bare :class:`~transformers.TrainerCallback` that just prints with logger.
"""
def __init__(self, logger, exclude=('total_flos', 'epoch')):
self.logger = logger
self.exclude = exclude
def on_log(self, args, state, control, logs=None, **kwargs):
if state.is_world_process_zero:
self.logger.info(''.join([
f"[global_steps={state.global_step}]",
f"[epochs={logs['epoch']}]",
','.join(f'{k}={v}' for k, v in logs.items()
if k not in self.exclude)
]))
class DatasetUpdateCallback(transformers.TrainerCallback):
def __init__(self, trainer):
self.trainer = trainer
def on_epoch_begin(self, args, state, control, **kwargs):
sampler = self.trainer.callback_handler.train_dataloader.sampler
self.trainer.train_dataset.update(sampler.epoch)
class SaveDiskCallback(transformers.TrainerCallback):
def on_save(self, args, state, control, **kwargs):
if args.local_rank != 0:
return
for ckpt in os.listdir(args.output_dir):
# remove out-of-date deepspeed checkpoints
if ckpt.startswith('checkpoint-') and not ckpt.endswith(f'-{state.global_step}'):
for pattern in ['global_step*', '*.pth']:
os.system("rm -rf " + os.path.join(args.output_dir, ckpt, pattern))
def on_train_end(self, args, state, control, **kwargs):
if state.is_local_process_zero:
for pattern in ['global_step*', '*.pth']:
os.system("rm -rf " + os.path.join(args.output_dir, "checkpoint-*", pattern))
## Setup
1. **Create a Conda Environment**
Use the following command to create and activate a new environment for the SFT training:
```bash
conda create -n sft_env python=3.9
conda activate sft_env
```
2. **Install Dependencies**
After activating the environment, install all required dependencies by running:
```bash
pip install -r requirements.txt
```
3. **Binarize Data**
Provide the raw data as follows:
a raw JSONL file in which each line is a JSON object, e.g.:
```json
{
"messages":[
{"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
{"role": "user", "content": "Write a regex expression to match any letter of the alphabet"},
{"role": "assistant", "content": "The regex expression to match any letter of the alphabet (either in uppercase or lowercase) is: \n\n```regex\n[a-zA-Z]\n```"},
{"role": "user", "content": "How about if I only want to match uppercase letters? Can you modify the regex expression for that?"},
{"role": "assistant", "content": "Sure, the regex expression to match any uppercase letter of the alphabet is:\n\n```regex\n[A-Z]\n```"}
],
"format": "chatml"
}
```
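For reference, a minimal sketch of producing records in this format (the conversation content is a placeholder; the output path mirrors the commands below):
```python
import json

# Placeholder conversation; "messages" and "format" follow the schema shown above.
record = {
    "messages": [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": "Write a regex expression to match any letter of the alphabet"},
        {"role": "assistant", "content": "[a-zA-Z]"},
    ],
    "format": "chatml",
}

# Append one JSON object per line to the raw jsonl file.
with open("/path/to/raw/sft.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
```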
Binarize the raw data (the script takes the input path, output path, and tokenizer path as positional arguments):
```bash
INPUT_PATH="/path/to/raw/sft.jsonl"
OUTPUT_PATH="/path/to/processed/sft.jsonl"
TOKENIZER_PATH="/path/to/pretrained_models/Qwen/Qwen2___5-Coder-1___5B/"
bash ./scripts/binarize_data.sh ${INPUT_PATH} ${OUTPUT_PATH} ${TOKENIZER_PATH}
```
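After binarization you can sanity-check the processed data. A minimal sketch, assuming the default `.npy` save format (`binarize_data.py` appends the `.npy` suffix to the configured output path):
```python
import numpy as np

# Hypothetical path: the binarizer writes "<OUTPUT_PATH>.npy".
objs = np.load("/path/to/processed/sft.jsonl.npy", allow_pickle=True)

print(f"{len(objs)} tokenized samples")
sample = objs[0]
# Each sample holds token ids, per-token labels (-100 marks ignored positions), and the length.
print(sample["input_ids"][:16])
print(sample["label"][:16])
print(sample["length"])
```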
4. **Training**
Once the environment is ready and the model paths are configured, run the SFT training by executing the following script (it takes the data path, pretrained model path, and output directory as positional arguments):
```bash
DATA_PATH="/path/to/processed/sft.jsonl"
PRETRAINED_MODEL="/path/to/pretrained_models/Qwen/Qwen2___5-Coder-1___5B/"
OUTPUT_DIR="/path/to/checkpoints/sft_model/"
bash ./scripts/sft_qwencoder.sh ${DATA_PATH} ${PRETRAINED_MODEL} ${OUTPUT_DIR}
```
import jsonlines
import os
import numpy as np
import transformers
import tqdm
import sys
from typing import Dict
import argparse
import itertools
import json
from utils import utils
IGNORE_INDEX = -100  # default ignore_index is -100 in transformers
# Set special tokens globally to avoid adding them multiple times.
def setup_tokenizer(tokenizer):
tokenizer.add_special_tokens({
"additional_special_tokens": [
"<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|repo_name|>",
"<|file_sep|>", "<|im_start|>", "<|im_end|>"
]
})
return tokenizer
def chatml_format_preprocess(sources,
tokenizer: transformers.PreTrainedTokenizer, max_len: int,
system_message: str = "You are a helpful assistant.",
only_last_turn_loss=False,
return_test_input_ids = False
) -> Dict:
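# Renders a ChatML conversation into token ids plus per-token labels: system and user
# turns are masked with IGNORE_INDEX, assistant turns keep their ids so only they
# contribute to the loss; returns None when the full conversation exceeds max_len.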
roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
im_start = tokenizer("<|im_start|>").input_ids[0]
im_end = tokenizer("<|im_end|>").input_ids[0]
nl_tokens = tokenizer('\n').input_ids
if len(nl_tokens) > 0:
nl_tokens = nl_tokens[-1:]
_system = tokenizer('system').input_ids + nl_tokens
_user = tokenizer('user').input_ids + nl_tokens
_assistant = tokenizer('assistant').input_ids + nl_tokens
input_id, target, test_input_ids = [], [], []
if sources[0]["content"] != "" and sources[0]["role"] == "system":
system_message = sources[0]["content"]
system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
input_id += system
test_input_ids += system
target += [im_start] + [IGNORE_INDEX] * (len(system) - 3) + [im_end] + nl_tokens
assert len(input_id) == len(target), "Input and target lengths do not match."
for j, sentence in enumerate(sources[1:]):
role = roles.get(sentence["role"])
if not role:
raise ValueError(f"Unknown role '{sentence['role']}' encountered.")
_input_id = tokenizer(role).input_ids + nl_tokens + tokenizer(sentence["content"], add_special_tokens=False).input_ids + [im_end] + nl_tokens
input_id += _input_id
if role == '<|im_start|>user' or (only_last_turn_loss and j < len(sources[1:]) - 1):
_target = [im_start] + [IGNORE_INDEX] * (len(_input_id) - 3) + [im_end] + nl_tokens
elif role == '<|im_start|>assistant':
_target = [im_start] + [IGNORE_INDEX] * len(tokenizer(role).input_ids) + _input_id[len(tokenizer(role).input_ids) + 1: -2] + [im_end] + nl_tokens
else:
raise NotImplementedError(f"Role '{role}' is not implemented.")
target += _target
if j == len(sources[1:]) - 1:
test_input_ids += tokenizer(role).input_ids + nl_tokens
else:
test_input_ids += _input_id
assert len(input_id) == len(target), "Final input and target lengths do not match."
if len(input_id) > max_len:
return None
if return_test_input_ids:
return dict(
test_input_ids=test_input_ids,
input_ids=input_id,
label=target,
)
else:
return dict(
input_ids=input_id,
label=target,
length=[len(input_id)]
)
def read_file_from_position_with_chatml_format_processor(args):
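# Worker task: seeks to the next full line at start_position, tokenizes each jsonl
# conversation in the assigned [start_position, end_position) byte range with
# chatml_format_preprocess, and returns the resulting examples.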
filename, start_position, end_position, worker_id, args = args
tokenizer = args["tokenizer"]
max_len = args["max_len"]
objs = []
with open(filename, 'r', encoding='utf-8', errors='replace') as f: # Using 'replace' to handle errors better
current_position = utils.find_next_line(f, start_position)
f.seek(current_position)
if current_position >= end_position:
print(f"worker_id {worker_id} completed")
return objs
for cnt in tqdm.tqdm(itertools.count(), position=worker_id, desc=f"worker_id: {worker_id}"):
line = f.readline()
if not line:
break
try:
obj = json.loads(line)
except json.JSONDecodeError:
print("Invalid json!")
continue
obj = chatml_format_preprocess(
obj["messages"], tokenizer, max_len=max_len,
only_last_turn_loss=obj.get("only_last_turn_loss", True)
)
if obj is not None:
objs.append(obj)
if f.tell() >= end_position:
break
print(f"worker_id {worker_id} completed")
return objs
def convert_to_uint32(x):
return np.array(x, dtype = np.uint32)
def convert_to_int32(x):
return np.array(x, dtype = np.int32)
def save_mmap(objs, key, output_path, padding_value):
os.makedirs(os.path.dirname(output_path), exist_ok=True)
data = []
max_length = 0
for obj in tqdm.tqdm(objs):
vec = obj[key]
data.append(vec)
max_length = max(max_length, len(vec))
n_samples = len(data)
utils.save_json(data = {
"n_samples": n_samples,
"max_len": max_length,
}, output_path=f"{output_path}.shape.json")
# Create mmap
data_shape = (n_samples, max_length)
data_mmap = np.memmap(
output_path,
dtype=np.int32,
mode='w+',
shape=data_shape
)
for i, vec in enumerate(data):
padded_vec = vec + [padding_value] * (max_length - len(vec))
data_mmap[i] = padded_vec
data_mmap.flush()
def tokenize_file(workers=64, chunk_size=10000, input_path="./raw/sft.jsonl", output_path="./processed/sft.jsonl", tokenizer=None, max_len=32768, save_format = ".npy"):
output_objs = utils.multi_tasks_from_file(input_path, workers=workers, task=read_file_from_position_with_chatml_format_processor, chunk_size=chunk_size, args={"tokenizer": tokenizer, "max_len": max_len})
if save_format == ".jsonl":
utils.write_jsonl_file(output_objs, output_path)
print(f"Successfully saved to {output_path}")
elif save_format == ".npy":
for obj in output_objs:
obj["input_ids"] = convert_to_uint32(obj["input_ids"])
obj["label"] = convert_to_int32(obj["label"])
if "test_input_ids" in obj:
obj["test_input_ids"] = convert_to_uint32(obj["test_input_ids"])
np.save(f"{output_path}.npy", output_objs, allow_pickle=True)
print(f"Successfully saved to {output_path}.npy")
elif save_format == ".mmap":
save_mmap(output_objs, key = "input_ids", output_path = f"{output_path}.input_ids.mmap", padding_value = tokenizer.pad_token_id)
save_mmap(output_objs, key = "label", output_path = f"{output_path}.labels.mmap", padding_value = IGNORE_INDEX)
save_mmap(output_objs, key = "length", output_path = f"{output_path}.lengths.mmap", padding_value = IGNORE_INDEX)
print(f"Successfully saved to {output_path}.input_ids.mmap and {output_path}.label.mmap and {output_path}.lengths.mmap")
def parse_args():
parser = argparse.ArgumentParser(description='Argument Parser Example')
parser.add_argument('--input_path', '-input_path', type=str, default="./raw/sft.jsonl.sampled", help='Path to input file')
parser.add_argument('--output_path', '-output_path', type=str, default="./raw/sft.jsonl.sampled.processed", help='Path to output file')
parser.add_argument('--workers', '-workers', type=int, default=1, help='Number of workers')
parser.add_argument('--chunk_size', '-chunk_size', type=float, default=0.1 * 2 ** 30, help='Chunk size for file processing')
parser.add_argument('--max_len', '-max_len', type=int, default=8192, help='Maximum length for tokenization')
parser.add_argument('--tokenizer_path', '-tokenizer_path', type=str, default="./pretrained_models/qwen/Qwen2.5-Coder-7B/", help='Path to tokenizer')
parser.add_argument('--save_format', '-save_format', type=str, default=".npy", help='Output save format: .jsonl, .npy, or .mmap')
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
print(args)
tokenizer = transformers.AutoTokenizer.from_pretrained(
args.tokenizer_path,
add_eos_token=False,
add_bos_token=False,
pad_token='<|endoftext|>',
eos_token='<|im_end|>',
cache_dir=None,
model_max_length=8192 * 5,
truncation=True,
padding_side="right",
trust_remote_code=True
)
tokenizer = setup_tokenizer(tokenizer) # Set special tokens once
tokenize_file(workers=args.workers, chunk_size=args.chunk_size, input_path=args.input_path, output_path=args.output_path, tokenizer=tokenizer, max_len=args.max_len, save_format = args.save_format)
{
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"total_num_steps": "auto",
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 10,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
from modelscope.hub.snapshot_download import snapshot_download
model_dir = snapshot_download('Qwen/Qwen2.5-Coder-1.5B', cache_dir='./pretrained_models/')
model_dir = snapshot_download('Qwen/Qwen2.5-Coder-7B', cache_dir='./pretrained_models/')
from pathlib import Path
import sys
parent_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(parent_dir))
from utils import utils
from utils import code_execute_multiple
import tqdm
import re
import argparse
def pack_code(code, programming_language):
code_lines = code.splitlines()
for line in code_execute_multiple.IMPORT_HELPER[programming_language]:
if line not in code_lines:
code_lines.insert(0, line)
code = "\n".join(code_lines)
return code
def remove_irrelevant_code(code, entry_point):
code = code.replace("\t", " ")  # normalize tabs to spaces
code_lines = code.splitlines()
new_code_lines = []
skip_tag = False
indent = 0
for line in code_lines:
if skip_tag and not line[indent:].startswith(" "):
skip_tag = False
if line.strip().startswith("def ") and not line.strip().startswith(f"def {entry_point}"):
indent = len(line.split("def ")[0])
skip_tag = True
if not skip_tag:
new_code_lines.append(line)
code = "\n".join(new_code_lines)
if "\ncheck_correctness()" not in code:
code += "\n" + "check_correctness()"
return code
def execute_code_task(objs, worker_id=0, workers=1, args = None):
output_objs = []
for obj in tqdm.tqdm(objs, position=worker_id, desc=f"Worker {worker_id}"):
question = obj["messages"][1]["content"]
answer = obj["gpt-4o_response"]
unit_test = obj["gpt-4o_unittest"]
answer_match = re.search(r"```.*?\n(.*?)```", answer, flags=re.DOTALL)
unittest_match = re.search(r"```.*?\n(.*?)```", unit_test, flags=re.DOTALL)
programming_language = obj["language"]
if answer_match is not None and unittest_match is not None:
unittest_code = unittest_match.group(1)
unittest_code = remove_irrelevant_code(unittest_code, entry_point = "check_correctness")
answer_code = answer_match.group(1)
answer_code = pack_code(answer_code, programming_language)
code = answer_code + "\n" + unittest_code
if code_execute_multiple.check_correctness_multiple(code, programming_language):
output_objs.append({
"question": question,
"answer": answer,
"answer_code": answer_code,
"unittest_code": unittest_code,
"unittest": unit_test
})
print(f"worker {worker_id} finished...")
return output_objs
def parse_args():
parser = argparse.ArgumentParser(description='Argument Parser Example')
parser.add_argument('--input_path', '-input_path', type=str, default="python_evol.jsonl", help='Path to input file')
parser.add_argument('--output_path', '-output_path', type=str, default="python_evol.jsonl.unittest", help='Path to output file')
parser.add_argument('--workers', '-workers', type=int, default = 1, help='Number of workers')
args = parser.parse_args()
return args
def main():
args = parse_args()
objs = utils.read_jsonl_file(args.input_path)
objs = utils.multi_tasks_from_objs(objs, workers = args.workers, task = execute_code_task, chunk_size=None, args = None)
utils.write_jsonl_file(objs, args.output_path)
if __name__ == "__main__":
main()
absl-py==2.1.0
accelerate==0.33.0
annotated-types==0.7.0
attrs==24.2.0
certifi==2024.7.4
charset-normalizer==3.3.2
click==8.1.7
deepspeed==0.14.5
et-xmlfile==1.1.0
filelock==3.15.4
fire==0.6.0
fsspec==2024.6.1
h5py==3.11.0
hjson==3.1.0
huggingface-hub==0.24.5
idna==3.7
Jinja2==3.1.4
joblib==1.4.2
jsonlines==4.0.0
MarkupSafe==2.1.5
mpmath==1.3.0
networkx==3.2.1
ninja==1.11.1.1
nltk==3.9
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-ml-py==12.560.30
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.20
nvidia-nvtx-cu12==12.1.105
openai==0.9.0
openpyxl==3.1.5
packaging==24.1
pandas==2.2.2
pandas-stubs==2.2.2.240807
pillow==10.4.0
protobuf==5.27.3
psutil==6.0.0
py-cpuinfo==9.0.0
pydantic==2.8.2
pydantic_core==2.20.1
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.2
regex==2024.7.24
requests==2.32.3
rouge_score==0.1.2
safetensors==0.4.4
sentencepiece==0.2.0
six==1.16.0
sympy==1.13.2
tensorboardX==2.6.2.2
termcolor==2.4.0
tokenizers==0.15.2
torch==2.4.0
tqdm==4.66.5
transformers==4.37.0
triton==3.0.0
types-pytz==2024.1.0.20240417
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
export PATH=/path/to/miniconda3/envs/qwen/bin:$PATH;
cd ./Qwen2.5-Coder-evaluation/sft/;
INPUT_PATH=${1}
OUTPUT_PATH=${2}
TOKENIZER_PATH=${3}
INPUT_PATH=${INPUT_PATH:-"./raw/sft.jsonl"}
OUTPUT_PATH=${OUTPUT_PATH:-"./processed/sft.jsonl"}
TOKENIZER_PATH=${TOKENIZER_PATH:-"./pretrained_models/Qwen/Qwen2___5-Coder-1___5B/"}
python binarize_data.py -input_path ${INPUT_PATH} -output_path ${OUTPUT_PATH} -workers 64 -tokenizer_path ${TOKENIZER_PATH}
export NCCL_IB_TC=136
export NCCL_IB_SL=5
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=bond0
export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5
export NCCL_IB_TIMEOUT=22
export NCCL_IB_QPS_PER_CONNECTION=8
export NCCL_NET_PLUGIN=none
export PATH=/path/to/miniconda3/envs/qwen/bin:$PATH;
DATA_PATH=${1}
PRETRAINED_MODEL=${2}
OUTPUT_DIR=${3}
DATA_PATH=${DATA_PATH:-"/path/to/processed/sft.jsonl"}
PRETRAINED_MODEL=${PRETRAINED_MODEL:-"/path/to/pretrained_models/Qwen/Qwen2___5-Coder-1___5B/"}
# Hyperparameters (defined before OUTPUT_DIR so the default path can reference them)
BATCH_SIZE=1024
MICRO_BATCH_SIZE=4
LR=5e-5
MIN_LR=5e-6
WARMUP_STEPS=100
WEIGHT_DECAY=0.0
MAX_LENGTH=1280
OUTPUT_DIR=${OUTPUT_DIR:-"/path/to/checkpoints/lr${LR}-wr${WARMUP_STEPS}-wd${WEIGHT_DECAY}-bsz${BATCH_SIZE}-maxlen${MAX_LENGTH}/"}
GPUS_PER_NODE=$(python -c "import torch; print(torch.cuda.device_count());")
MASTER_ADDR=${MASTER_ADDR:-localhost}
NNODES=${WORLD_SIZE:-1}
NODE_RANK=${RANK:-0}
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
MASTER_PORT=${MASTER_PORT:-6105}
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
DEEPSPEED_CONFIG="./configs/default_offload_opt_param.json"
GRAD_ACCU=$(($BATCH_SIZE / $WORLD_SIZE / $MICRO_BATCH_SIZE))
echo $OUTPUT_DIR
echo "Pretrained Model" ${PRETRAINED_MODEL}
echo "WORLD_SIZE" $WORLD_SIZE "MICRO BATCH SIZE" $MICRO_BATCH_SIZE "GRAD_ACCU" $GRAD_ACCU
echo $DISTRIBUTED_ARGS
ROOT_PATH="/path/to/sft/"
cd ${ROOT_PATH};
torchrun ${DISTRIBUTED_ARGS} train.py \
--model_name_or_path ${PRETRAINED_MODEL} \
--data_path $DATA_PATH \
--model_max_length ${MAX_LENGTH} \
--output_dir ${OUTPUT_DIR} \
--num_train_epochs 3 \
--per_device_train_batch_size ${MICRO_BATCH_SIZE} \
--gradient_accumulation_steps ${GRAD_ACCU} \
--per_device_eval_batch_size 4 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 100 \
--save_total_limit 100 \
--learning_rate ${LR} \
--weight_decay ${WEIGHT_DECAY} \
--warmup_steps ${WARMUP_STEPS} \
--lr_scheduler_type "cosine" \
--logging_strategy "steps" \
--logging_steps 1 \
--deepspeed ${DEEPSPEED_CONFIG} \
--report_to "tensorboard" \
--bf16 True \
--tf32 True \
--truncate_source False
import copy
import logging
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence
import argparse
import torch
import transformers
import utils
from torch.utils.data import Dataset
from transformers import Trainer
import torch.distributed as dist
import sys
import os
import numpy as np
from utils import utils
from utils import training_datasets
IGNORE_INDEX = -100  # default ignore_index is -100 in transformers
logging.basicConfig(level=logging.DEBUG)
@dataclass
class ModelArguments:
model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
@dataclass
class DataArguments:
data_path: str = field(default=None, metadata={"help": "Path to the training data."})
@dataclass
class TrainingArguments(transformers.TrainingArguments):
cache_dir: Optional[str] = field(default=None)
optim: str = field(default="adamw_torch")
model_max_length: int = field(
default=512,
metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
)
truncate_source: bool = field(default=False)
def smart_tokenizer_and_embedding_resize(
special_tokens_dict: Dict,
tokenizer: transformers.PreTrainedTokenizer,
model: transformers.PreTrainedModel,
):
"""Resize tokenizer and embedding.
Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
if num_new_tokens > 0:
input_embeddings = model.get_input_embeddings().weight.data
output_embeddings = model.get_output_embeddings().weight.data
input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
input_embeddings[-num_new_tokens:] = input_embeddings_avg
output_embeddings[-num_new_tokens:] = output_embeddings_avg
def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
"""Tokenize a list of strings."""
tokenized_list = [
tokenizer(
text,
return_tensors="pt",
padding="longest",
max_length=tokenizer.model_max_length,
truncation=True,
)
for text in strings
]
input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
input_ids_lens = labels_lens = [
tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
]
return dict(
input_ids=input_ids,
labels=labels,
input_ids_lens=input_ids_lens,
labels_lens=labels_lens,
)
@dataclass
class DataCollatorForSupervisedDataset(object):
"""Collate examples for supervised fine-tuning."""
tokenizer: transformers.PreTrainedTokenizer
def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
input_ids = torch.nn.utils.rnn.pad_sequence(
input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
)
labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
return dict(
input_ids=input_ids,
labels=labels,
attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
)
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, args) -> Dict:
"""Make dataset and collator for supervised fine-tuning."""
if args.data_path.endswith(".npy") or args.data_path.endswith(".jsonl"):
train_dataset = training_datasets.SupervisedDataset(tokenizer=tokenizer, data_path=args.data_path, args=args)
elif args.data_path.endswith(".mmap"):
train_dataset = training_datasets.MMAPSupervisedDataset(tokenizer=tokenizer, data_path=args.data_path, args=args)
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)
def is_master():
return dist.get_rank() == 0
class LoggingCallback(transformers.TrainerCallback):
def on_log(self, args, state, control, logs=None, **kwargs):
if logs is not None:
log_message = {
"loss": logs.get("loss", None),
"learning_rate": logs.get("learning_rate", None),
"epoch": logs.get("epoch", None),
"step": state.global_step
}
if is_master():
print(log_message)
def train():
parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
args = {**model_args.__dict__, **data_args.__dict__, **training_args.__dict__}
args = argparse.Namespace(**args)
#logging.info(args)
model = transformers.AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path,
cache_dir=training_args.cache_dir,
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
model_args.model_name_or_path,
pad_token = '<|endoftext|>',
eos_token = '<|im_end|>', #<|endoftext|>
cache_dir = None,
model_max_length = training_args.model_max_length,
truncation = True,
padding_side = "right",
trust_remote_code = True
)
tokenizer.add_special_tokens({"additional_special_tokens": ["<|im_end|>", "<|im_start|>"]})
data_module = make_supervised_data_module(tokenizer=tokenizer, args=args)
trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module, callbacks=[LoggingCallback])
trainer.train()
trainer.save_state()
trainer.save_model(output_dir=training_args.output_dir)
if __name__ == "__main__":
train()
from .multiple_metrics.containerized_eval import eval_string_script
IMPORT_HELPER = {
"python": [
"import math",
"import re",
"import sys",
"import copy",
"import datetime",
"import itertools",
"import collections",
"import heapq",
"import statistics",
"import functools",
"import hashlib",
"import numpy",
"import numpy as np",
"import string",
"from typing import *",
"from collections import *",
],
"go": [
"math",
"strings",
"fmt",
"strconv",
"time",
"bytes",
"regexp",
"sort",
"math/rand",
"crypto/md5",
],
"cpp": [
"using namespace std;",
"#include<cassert>",
"#include<stdlib.h>",
"#include<algorithm>",
"#include<cmath>",
"#include<math.h>",
"#include<numeric>",
"#include<stdio.h>",
"#include<vector>",
"#include<set>",
"#include<map>",
"#include<queue>",
"#include<stack>",
"#include<list>",
"#include<deque>",
"#include<boost/any.hpp>",
"#include<string>",
"#include<climits>",
"#include<cstring>",
"#include<iostream>",
"#include<sstream>",
"#include<fstream>",
],
"java": [
"import java.util.*;",
"import java.lang.reflect.*;",
"import org.javatuples.*;",
"import java.security.*;",
"import java.math.*;",
"import java.io.*;",
"import java.util.stream.*;",
],
"cs": [
"using System;",
"using System.Numerics;",
"using System.Diagnostics;",
"using System.Collections.Generic;",
"using System.Linq;",
"using System.Text;",
"using System.Security.Cryptography;",
"using System.Collections.Generic;",
],
}
def check_correctness_multiple(code_string, programming_language):
success = False
result = eval_string_script(programming_language, code_string)
if result["status"] == "OK":
success = True
return success
language_symbols = {
"python": {
"CLASS_TYPE": "class_definition",
"FUNCTION_TYPE": "function_definition",
"IMPORT_TYPE": ["import_statement", "import_from_statement"],
"IDENTIFIER_TYPE": "identifier",
"ATTRIBUTE_TYPE": "attribute",
"RETURN_TYPE": "return_statement",
"EXPRESSION_TYPE": "expression_statement",
"ASSIGNMENT_TYPE": "assignment"
},
"java": {
"CLASS_TYPE": "class_definition",
"FUNCTION_TYPE": "function_definition",
"IMPORT_TYPE": ["import_statement", "import_from_statement"],
"IDENTIFIER_TYPE": "identifier",
"ATTRIBUTE_TYPE": "attribute",
"RETURN_TYPE": "return_statement",
"EXPRESSION_TYPE": "expression_statement",
"ASSIGNMENT_TYPE": "assignment"
},
"c-sharp": {
"CLASS_TYPE": "class_definition",
"FUNCTION_TYPE": "function_definition",
"IMPORT_TYPE": ["import_statement", "import_from_statement"],
"IDENTIFIER_TYPE": "identifier",
"ATTRIBUTE_TYPE": "attribute",
"RETURN_TYPE": "return_statement",
"EXPRESSION_TYPE": "expression_statement",
"ASSIGNMENT_TYPE": "assignment"
},
"typescript": {
"CLASS_TYPE": "class_definition",
"FUNCTION_TYPE": "function_definition",
"IMPORT_TYPE": ["import_statement", "import_from_statement"],
"IDENTIFIER_TYPE": "identifier",
"ATTRIBUTE_TYPE": "attribute",
"RETURN_TYPE": "return_statement",
"EXPRESSION_TYPE": "expression_statement",
"ASSIGNMENT_TYPE": "assignment"
}
}
def guess_lang(code):
characteristics = {
"python": [
"break", "class ", "continue", "def ", "del ", "elif ", "else ",
"except", "finally", "for ", "from ", "global ", "if ", "import",
"lambda", "nonlocal", "pass", "raise",
"return", "try", "while", "with", "yield",
"def ", "import ", "as ", "lambda ", "print(", "class ", "self.", "raise ", "except", "python"
],
"c++": [
"#include <", "std::", "cout <<", "cin >>", "namespace ", "NULL", "std::vector<", "std::string", "template<", "::"
],
"java": [
"public class ", "public static void main", "System.out.println", "import java.", "extends ", "implements ", "new ", "throws ", "// "
],
"php": [
"<?php", "echo ", "<?= ", "$", "public function ", "array(", "class ", ";", "?>"
],
"typescript": [
"interface ", "let ", ": number", "=>", "enum ", "type ", "public ", "private ", "protected ", "import "
],
"r": [
"<-", "library(", "data.frame(", "ggplot(", "plot(", "function(", " <-", " c(", "list("
],
"swift": [
"let ", "var ", "func ", "import SwiftUI", "struct ", "enum ", "class ", "override ", "extension ", "self."
],
"rust": [
"fn ", "let ", "mut ", "use std::", "impl ", "#[derive(", "match ", "pub struct ", "mod ", "extern crate "
],
"go": [
"package ", "import ", "func ", "var ", "const ", "type ", "chan ", "defer ", "go func", "map["
],
"C#": [
"using System;", "static void Main", "Console.WriteLine", "public class ", "namespace ", "get; set;", "[", "/*"
],
"Bash": [
"#!/bin/bash", "echo ", "grep ", "function ", "if ", "then ", "fi", "do", "done", "case in", "export ", "`"
],
"jupyter": [
"%matplotlib inline", "import pandas as pd", "# In[", "plt.plot(", "pd.DataFrame(", "!pip install ", "%load_ext"
]
}
for language, signs in characteristics.items():
code_tokens = code.split()
key_words = set(signs) & set(code_tokens)
if len(key_words) > 0:
return language, ", ".join(key_words)
return "unknown", None