Commit af238596 authored by chenzk

v1.0
# Copyright (c) 2024 westlake-repl
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# SPDX-License-Identifier: MIT
# This file has been modified by Junyi Chen.
#
# Original file was released under MIT, with the full license text
# available at https://choosealicense.com/licenses/mit/.
#
# This modified file is released under the same license.
general_arguments = [
'seed',
'reproducibility',
'state',
'model',
'data_path',
'checkpoint_dir',
'show_progress',
'config_file',
'log_wandb',
'use_text',
'strategy',
'precision'
]
training_arguments = [
'epochs', 'train_batch_size',
'optim_args',
'eval_step', 'stopping_step',
'clip_grad_norm',
'loss_decimal_place',
]
evaluation_arguments = [
'eval_type',
'repeatable',
'metrics', 'topk', 'valid_metric', 'valid_metric_bigger',
'eval_batch_size',
'metric_decimal_place',
]
dataset_arguments = [
'MAX_TEXT_LENGTH',
'MAX_ITEM_LIST_LENGTH',
'MAX_ITEM_LIST_LENGTH_TEST',
'num_negatives',
'text_keys',
'item_prompt',
]
# Copyright (c) 2024 westlake-repl
# SPDX-License-Identifier: MIT
from enum import Enum
class InputType(Enum):
SEQ = 1
PAIR = 2
AUGSEQ = 3
class EvaluatorType(Enum):
"""Type for evaluation metrics.
- ``RANKING``: Ranking-based metrics like NDCG, Recall, etc.
- ``VALUE``: Value-based metrics like AUC, etc.
"""
RANKING = 1
VALUE = 2
# Copyright (c) 2024 westlake-repl
# SPDX-License-Identifier: MIT
import logging
import os
import sys
import colorlog
import re
import torch
from REC.utils.utils import get_local_time, ensure_dir
from colorama import init
log_colors_config = {
'DEBUG': 'cyan',
'WARNING': 'yellow',
'ERROR': 'red',
'CRITICAL': 'red',
}
class RemoveColorFilter(logging.Filter):
def filter(self, record):
if record:
ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
record.msg = ansi_escape.sub('', str(record.msg))
return True
def set_color(log, color, highlight=True):
color_set = ['black', 'red', 'green', 'yellow', 'blue', 'pink', 'cyan', 'white']
try:
index = color_set.index(color)
except ValueError:
index = len(color_set) - 1
prev_log = '\033['
if highlight:
prev_log += '1;3'
else:
prev_log += '0;3'
prev_log += str(index) + 'm'
return prev_log + log + '\033[0m'
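# Example: set_color('test result', 'yellow') returns '\033[1;33mtest result\033[0m',
# i.e. the text wrapped in a bold yellow ANSI escape sequence.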
def init_logger(config):
"""
A logger that shows messages on standard output and simultaneously writes them
into a log file whose path is derived from ``checkpoint_dir``, ``model`` and,
optionally, ``log_path``.
All messages that you want to log MUST be str.
Args:
    config (Config): An instance object of Config, used to record parameter information.
Example:
    >>> init_logger(config)
    >>> logger = logging.getLogger()
    >>> logger.debug(train_state)
    >>> logger.info(train_result)
"""
init(autoreset=True)
LOGROOT = config['checkpoint_dir'] + '/' if config['checkpoint_dir'] else './log/'
dir_name = os.path.dirname(LOGROOT)
rank = torch.distributed.get_rank()
if rank == 0:
ensure_dir(dir_name)
model_name = os.path.join(dir_name, config['model'])
ensure_dir(model_name)
torch.distributed.barrier()
logfilename = '{}/{}.log'.format(config['model'], get_local_time())
logfilepath = os.path.join(LOGROOT, logfilename)
if config['log_path']:
logfilepath = os.path.join(LOGROOT, config['log_path'])
filefmt = "%(asctime)-15s %(levelname)s %(message)s"
filedatefmt = "%a %d %b %Y %H:%M:%S"
fileformatter = logging.Formatter(filefmt, filedatefmt)
sfmt = "%(log_color)s%(asctime)-15s %(levelname)s %(message)s"
sdatefmt = "%d %b %H:%M"
sformatter = colorlog.ColoredFormatter(sfmt, sdatefmt, log_colors=log_colors_config)
if config['state'] is None or config['state'].lower() == 'info':
level = logging.INFO
elif config['state'].lower() == 'debug':
level = logging.DEBUG
elif config['state'].lower() == 'error':
level = logging.ERROR
elif config['state'].lower() == 'warning':
level = logging.WARNING
elif config['state'].lower() == 'critical':
level = logging.CRITICAL
else:
level = logging.INFO
fh = logging.FileHandler(logfilepath)
fh.setLevel(level)
fh.setFormatter(fileformatter)
remove_color_filter = RemoveColorFilter()
fh.addFilter(remove_color_filter)
sh = logging.StreamHandler(sys.stdout)
sh.setLevel(level)
sh.setFormatter(sformatter)
logging.basicConfig(level=level if rank in [-1, 0] else logging.WARN, handlers=[sh, fh])
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
import math
from bisect import bisect_right
import torch
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR
def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1):
"""
Create a schedule with a constant learning rate, using the learning rate set in optimizer.
Args:
optimizer (:class:`~torch.optim.Optimizer`):
The optimizer for which to schedule the learning rate.
last_epoch (:obj:`int`, `optional`, defaults to -1):
The index of the last epoch when resuming training.
Return:
:obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
def get_constant_schedule_with_warmup(
optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1
):
"""
Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate
increases linearly between 0 and the initial lr set in the optimizer.
Args:
optimizer (:class:`~torch.optim.Optimizer`):
The optimizer for which to schedule the learning rate.
num_warmup_steps (:obj:`int`):
The number of steps for the warmup phase.
last_epoch (:obj:`int`, `optional`, defaults to -1):
The index of the last epoch when resuming training.
Return:
:obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
def lr_lambda(current_step: int):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1.0, num_warmup_steps))
return 1.0
return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
def get_linear_schedule_with_warmup(
optimizer, num_warmup_steps, num_training_steps, last_epoch=-1, lr_end=1e-7
):
"""
Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
Args:
optimizer (:class:`~torch.optim.Optimizer`):
The optimizer for which to schedule the learning rate.
num_warmup_steps (:obj:`int`):
The number of steps for the warmup phase.
num_training_steps (:obj:`int`):
The total number of training steps.
last_epoch (:obj:`int`, `optional`, defaults to -1):
The index of the last epoch when resuming training.
Return:
:obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
def lr_lambda(current_step: int):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
return max(
lr_end,
float(num_training_steps - current_step)
/ float(max(1, num_training_steps - num_warmup_steps)),
)
return LambdaLR(optimizer, lr_lambda, last_epoch)
def get_cosine_schedule_with_warmup(
optimizer: Optimizer,
num_warmup_steps: int,
num_training_steps: int,
num_cycles: float = 0.5,
last_epoch: int = -1,
):
"""
Create a schedule with a learning rate that decreases following the values of the cosine function between the
initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
initial lr set in the optimizer.
Args:
optimizer (:class:`~torch.optim.Optimizer`):
The optimizer for which to schedule the learning rate.
num_warmup_steps (:obj:`int`):
The number of steps for the warmup phase.
num_training_steps (:obj:`int`):
The total number of training steps.
num_cycles (:obj:`float`, `optional`, defaults to 0.5):
The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
following a half-cosine).
last_epoch (:obj:`int`, `optional`, defaults to -1):
The index of the last epoch when resuming training.
Return:
:obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
progress = float(current_step - num_warmup_steps) / float(
max(1, num_training_steps - num_warmup_steps)
)
return max(
0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
)
return LambdaLR(optimizer, lr_lambda, last_epoch)
def get_cosine_with_hard_restarts_schedule_with_warmup(
optimizer: Optimizer,
num_warmup_steps: int,
num_training_steps: int,
num_cycles: int = 1,
last_epoch: int = -1,
):
"""
Create a schedule with a learning rate that decreases following the values of the cosine function between the
initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases
linearly between 0 and the initial lr set in the optimizer.
Args:
optimizer (:class:`~torch.optim.Optimizer`):
The optimizer for which to schedule the learning rate.
num_warmup_steps (:obj:`int`):
The number of steps for the warmup phase.
num_training_steps (:obj:`int`):
The total number of training steps.
num_cycles (:obj:`int`, `optional`, defaults to 1):
The number of hard restarts to use.
last_epoch (:obj:`int`, `optional`, defaults to -1):
The index of the last epoch when resuming training.
Return:
:obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
progress = float(current_step - num_warmup_steps) / float(
max(1, num_training_steps - num_warmup_steps)
)
if progress >= 1.0:
return 0.0
return max(
0.0,
0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))),
)
return LambdaLR(optimizer, lr_lambda, last_epoch)
def get_polynomial_decay_schedule_with_warmup(
optimizer,
num_warmup_steps,
num_training_steps,
lr_end=1e-7,
power=1.0,
last_epoch=-1,
):
"""
Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
optimizer to end lr defined by `lr_end`, after a warmup period during which it increases linearly from 0 to the
initial lr set in the optimizer.
Args:
optimizer (:class:`~torch.optim.Optimizer`):
The optimizer for which to schedule the learning rate.
num_warmup_steps (:obj:`int`):
The number of steps for the warmup phase.
num_training_steps (:obj:`int`):
The total number of training steps.
lr_end (:obj:`float`, `optional`, defaults to 1e-7):
The end LR.
power (:obj:`float`, `optional`, defaults to 1.0):
Power factor.
last_epoch (:obj:`int`, `optional`, defaults to -1):
The index of the last epoch when resuming training.
Note: `power` defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT
implementation at
https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
Return:
:obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
lr_init = optimizer.defaults["lr"]
assert lr_init > lr_end, f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})"
def lr_lambda(current_step: int):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
elif current_step > num_training_steps:
return lr_end / lr_init # as LambdaLR multiplies by lr_init
else:
lr_range = lr_init - lr_end
decay_steps = num_training_steps - num_warmup_steps
pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
decay = lr_range * pct_remaining**power + lr_end
return decay / lr_init # as LambdaLR multiplies by lr_init
return LambdaLR(optimizer, lr_lambda, last_epoch)
# FIXME ideally this would be achieved with a CombinedLRScheduler,
# separating MultiStepLR with WarmupLR
# but the current LRScheduler design doesn't allow it
class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
def __init__(
self,
optimizer,
milestones,
gamma=0.1,
warmup_factor=1.0 / 3,
warmup_iters=500,
warmup_method="linear",
last_epoch=-1,
):
if not list(milestones) == sorted(milestones):
    raise ValueError(
        "Milestones should be a list of increasing integers. Got {}".format(milestones)
    )
if warmup_method not in ("constant", "linear"):
    raise ValueError(
        "Only 'constant' or 'linear' warmup_method accepted, got {}".format(warmup_method)
    )
self.milestones = milestones
self.gamma = gamma
self.warmup_factor = warmup_factor
self.warmup_iters = warmup_iters
self.warmup_method = warmup_method
super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch)
def get_lr(self):
warmup_factor = 1
if self.last_epoch < self.warmup_iters:
if self.warmup_method == "constant":
warmup_factor = self.warmup_factor
elif self.warmup_method == "linear":
alpha = self.last_epoch / self.warmup_iters
warmup_factor = self.warmup_factor * (1 - alpha) + alpha
return [
base_lr
* warmup_factor
* self.gamma ** bisect_right(self.milestones, self.last_epoch)
for base_lr in self.base_lrs
]
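# Usage sketch (illustrative, not from the original file): attaching one of the
# warmup schedules above to an optimizer. The model, learning rate and step
# counts below are assumptions chosen only for demonstration.
if __name__ == '__main__':
    demo_model = torch.nn.Linear(16, 4)
    demo_optimizer = torch.optim.AdamW(demo_model.parameters(), lr=1e-4, weight_decay=0.1)
    demo_scheduler = get_cosine_schedule_with_warmup(
        demo_optimizer, num_warmup_steps=100, num_training_steps=1000
    )
    for _ in range(1000):
        demo_optimizer.step()   # normally preceded by a forward/backward pass
        demo_scheduler.step()   # advance warmup, then cosine decay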
# Copyright (c) 2024 westlake-repl
# SPDX-License-Identifier: MIT
import datetime
import importlib
import os
import random
import numpy as np
import torch
from tensorboardX import SummaryWriter
def get_local_time():
r"""Get current time
Returns:
str: current time
"""
torch.distributed.barrier()
cur = datetime.datetime.now()
cur = cur.strftime('%b-%d-%Y_%H-%M-%S')
return cur
def ensure_dir(dir_path):
r"""Make sure the directory exists, if it does not exist, create it
Args:
dir_path (str): directory path
"""
if not os.path.exists(dir_path):
os.makedirs(dir_path)
def get_model(model_name):
model_file_name = model_name.lower()
model_module = None
module_path = '.'.join(['REC.model.IDNet', model_file_name])
if importlib.util.find_spec(module_path, __name__):
model_module = importlib.import_module(module_path, __name__)
if model_module is None:
module_path = '.'.join(['REC.model.HLLM', model_file_name])
if importlib.util.find_spec(module_path, __name__):
model_module = importlib.import_module(module_path, __name__)
if model_module is None:
raise ValueError('`model_name` [{}] is not the name of an existing model.'.format(model_name))
model_class = getattr(model_module, model_name)
return model_class
def early_stopping(value, best, cur_step, max_step, bigger=True):
r""" validation-based early stopping
Args:
value (float): current result
best (float): best result
cur_step (int): the number of consecutive steps that did not exceed the best result
max_step (int): threshold steps for stopping
bigger (bool, optional): whether the bigger the better
Returns:
tuple:
- float,
best result after this step
- int,
the number of consecutive steps that did not exceed the best result after this step
- bool,
whether to stop
- bool,
whether to update
"""
stop_flag = False
update_flag = False
if bigger:
if value >= best:
cur_step = 0
best = value
update_flag = True
else:
cur_step += 1
if cur_step > max_step:
stop_flag = True
else:
if value <= best:
cur_step = 0
best = value
update_flag = True
else:
cur_step += 1
if cur_step > max_step:
stop_flag = True
return best, cur_step, stop_flag, update_flag
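# Illustrative validation loop (not from the original file) driven by
# early_stopping; the validation scores below are made up for demonstration.
if __name__ == '__main__':
    best, cur_step = -1.0, 0
    for epoch, score in enumerate([0.10, 0.12, 0.11, 0.10, 0.09, 0.08]):
        best, cur_step, stop, updated = early_stopping(
            score, best, cur_step, max_step=2, bigger=True
        )
        print(f'epoch={epoch} score={score:.2f} best={best:.2f} stop={stop} updated={updated}')
        if stop:
            break   # stops once cur_step exceeds max_step consecutive non-improving epochs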
def calculate_valid_score(valid_result, valid_metric=None):
r""" return valid score from valid result
Args:
valid_result (dict): valid result
valid_metric (str, optional): the selected metric in valid result for valid score
Returns:
float: valid score
"""
if valid_metric:
return valid_result[valid_metric]
else:
return valid_result['Recall@10']
def dict2str(result_dict):
r""" convert result dict to str
Args:
result_dict (dict): result dict
Returns:
str: result str
"""
return ' '.join([str(metric) + ' : ' + str(value) for metric, value in result_dict.items()])
def init_seed(seed, reproducibility):
r""" init random seed for random functions in numpy, torch, cuda and cudnn
Args:
seed (int): random seed
reproducibility (bool): Whether to require reproducibility
"""
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
if reproducibility:
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
else:
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
def get_tensorboard(logger):
r""" Creates a SummaryWriter of Tensorboard that can log PyTorch models and metrics into a directory for
visualization within the TensorBoard UI.
For the convenience of the user, the SummaryWriter's log_dir is named after the logger's output file.
Args:
logger: its output filename is used to name the SummaryWriter's log_dir.
If the filename is not available, we will name the log_dir according to the current time.
Returns:
SummaryWriter: it will write out events and summaries to the event file.
"""
base_path = 'log_tensorboard'
dir_name = None
for handler in logger.handlers:
if hasattr(handler, "baseFilename"):
dir_name = os.path.basename(getattr(handler, 'baseFilename')).split('.')[0]
break
if dir_name is None:
dir_name = '{}-{}'.format('model', get_local_time())
dir_path = os.path.join(base_path, dir_name)
writer = SummaryWriter(dir_path)
return writer
def get_gpu_usage(device=None):
r""" Return the reserved memory and total memory of given device in a string.
Args:
device: cuda.device. It is the device that the model runs on.
Returns:
str: it contains the info about reserved memory and total memory of given device.
"""
reserved = torch.cuda.max_memory_reserved(device) / 1024 ** 3
total = torch.cuda.get_device_properties(device).total_memory / 1024 ** 3
return '{:.2f} G/{:.2f} G'.format(reserved, total)
# Copyright (c) 2024 westlake-repl
# SPDX-License-Identifier: MIT
class WandbLogger(object):
"""WandbLogger to log metrics to Weights and Biases.
"""
def __init__(self, config):
"""
Args:
config (dict): A dictionary of parameters used by RecBole.
"""
self.config = config
self.log_wandb = config.log_wandb
self.setup()
def setup(self):
if self.log_wandb:
try:
import wandb
self._wandb = wandb
except ImportError:
raise ImportError(
"To use the Weights and Biases Logger please install wandb."
"Run `pip install wandb` to install it."
)
# Initialize a W&B run
if self._wandb.run is None:
self._wandb.init(
project=self.config.wandb_project,
config=self.config
)
self._set_steps()
def log_metrics(self, metrics, head='train', commit=True):
if self.log_wandb:
if head:
metrics = self._add_head_to_metrics(metrics, head)
self._wandb.log(metrics, commit=commit)
else:
self._wandb.log(metrics, commit=commit)
def log_eval_metrics(self, metrics, head='eval'):
if self.log_wandb:
metrics = self._add_head_to_metrics(metrics, head)
for k, v in metrics.items():
self._wandb.run.summary[k] = v
def _set_steps(self):
self._wandb.define_metric('train/*', step_metric='train_step')
self._wandb.define_metric('valid/*', step_metric='valid_step')
def _add_head_to_metrics(self, metrics, head):
head_metrics = dict()
for k, v in metrics.items():
if '_step' in k:
head_metrics[k] = v
else:
head_metrics[f'{head}/{k}'] = v
return head_metrics
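# Usage sketch (illustrative, not from the original file): WandbLogger reads
# `log_wandb` and `wandb_project` as attributes, so any attribute-style config
# object works. The Namespace below is an assumption for demonstration only;
# with log_wandb=False every call is a no-op and wandb need not be installed.
if __name__ == '__main__':
    from argparse import Namespace
    demo_config = Namespace(log_wandb=False, wandb_project='REC')
    wandb_logger = WandbLogger(demo_config)
    wandb_logger.log_metrics({'loss': 0.42, 'train_step': 1}, head='train')
    wandb_logger.log_eval_metrics({'NDCG@200': 0.123}, head='eval')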
---
license: apache-2.0
datasets:
- cerebras/SlimPajama-627B
- bigcode/starcoderdata
- HuggingFaceH4/ultrachat_200k
- HuggingFaceH4/ultrafeedback_binarized
language:
- en
widget:
- example_title: Fibonacci (Python)
messages:
- role: system
content: You are a chatbot who can help code!
- role: user
content: Write me a function to calculate the first 10 digits of the fibonacci sequence in Python and print it out to the CLI.
---
<div align="center">
# TinyLlama-1.1B
</div>
https://github.com/jzhang38/TinyLlama
The TinyLlama project aims to **pretrain** a **1.1B Llama model on 3 trillion tokens**. With some proper optimization, we can achieve this within a span of "just" 90 days using 16 A100-40G GPUs 🚀🚀. The training has started on 2023-09-01.
We adopted exactly the same architecture and tokenizer as Llama 2. This means TinyLlama can be plugged and played in many open-source projects built upon Llama. Besides, TinyLlama is compact with only 1.1B parameters. This compactness allows it to cater to a multitude of applications demanding a restricted computation and memory footprint.
#### This Model
This is the chat model finetuned on top of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T). **We follow [HF's Zephyr](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha)'s training recipe.** The model was initially fine-tuned on a variant of the [`UltraChat`](https://huggingface.co/datasets/stingning/ultrachat) dataset, which contains a diverse range of synthetic dialogues generated by ChatGPT.
We then further aligned the model with [🤗 TRL's](https://github.com/huggingface/trl) `DPOTrainer` on the [openbmb/UltraFeedback](https://huggingface.co/datasets/openbmb/UltraFeedback) dataset, which contains 64k prompts and model completions that are ranked by GPT-4.
#### How to use
You will need transformers>=4.34.
Do check the [TinyLlama](https://github.com/jzhang38/TinyLlama) GitHub page for more information.
```python
# Install transformers from source - only needed for versions <= v4.34
# pip install git+https://github.com/huggingface/transformers.git
# pip install accelerate
import torch
from transformers import pipeline
pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.bfloat16, device_map="auto")
# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])
# <|system|>
# You are a friendly chatbot who always responds in the style of a pirate.</s>
# <|user|>
# How many helicopters can a human eat in one sitting?</s>
# <|assistant|>
# ...
```
# Copyright (c) 2024 westlake-repl
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# SPDX-License-Identifier: MIT
# This file has been modified by Junyi Chen.
#
# Original file was released under MIT, with the full license text
# available at https://choosealicense.com/licenses/mit/.
#
# This modified file is released under the same license.
import os
import argparse
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# os.environ["OMP_NUM_THREADS"] = '1'
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--config_file", nargs='+')
args, unknown_args = parser.parse_known_args()
config_file = args.config_file
# Forward all config files plus any extra overrides to run.py via the TORCHRUN wrapper.
run_yaml = f"../TORCHRUN run.py --config_file {' '.join(config_file)} {' '.join(unknown_args)}"
os.system(run_yaml)
# general
seed: 2020
state: INFO
reproducibility: True
checkpoint_dir: 'saved'
show_progress: False
log_wandb: False
wandb_project: 'REC'
MAX_ITEM_LIST_LENGTH: 10
data_path: ../dataset/ # dataset path
dataset: Pixel200K # dataset name
loss: nce
# training settings
epochs: 200
train_batch_size: 64
optim_args: {
learning_rate: 0.0001,
weight_decay: 0.1
}
# eval settings
eval_batch_size: 1024
topk: [5,10,50,200]
metrics: ['Recall', 'NDCG']
valid_metric: NDCG@200
metric_decimal_place: 7
eval_step: 1
stopping_step: 10
# general
seed: 2020
state: INFO
reproducibility: True
checkpoint_dir: 'saved'
show_progress: False
log_wandb: False
wandb_project: 'REC'
MAX_ITEM_LIST_LENGTH: 10
data_path: ../dataset/ # dataset path
dataset: Pixel200K # dataset name
loss: nce
# training settings
epochs: 200
train_batch_size: 64
optim_args: {
learning_rate: 0.0001,
weight_decay: 0.1
}
# eval settings
eval_batch_size: 1024
topk: [5,10,50,200]
metrics: ['Recall', 'NDCG']
valid_metric: NDCG@200
metric_decimal_place: 7
eval_step: 1
stopping_step: 10
strategy: deepspeed
precision: bf16-mixed
stage: 2
# general
seed: 2020
state: INFO
use_text: True
reproducibility: True
checkpoint_dir: 'saved'
show_progress: True
log_wandb: False
wandb_project: 'REC'
MAX_ITEM_LIST_LENGTH: 50
MAX_TEXT_LENGTH: 64
data_path: ../dataset/ # dataset path
dataset: Pixel200K # dataset name
test_dataset: Pixel200K # dataset name
text_path: text_path # Use absolute path
text_keys: ['title', 'tag', 'description']
item_prompt: 'Compress the following sentence into embedding: '
item_emb_token_n: 1
# training settings
epochs: 10
train_batch_size: 8
optim_args: {
learning_rate: 1e-4,
weight_decay: 0.01
}
scheduler_args: {
type: cosine,
warmup: 0.1
}
# eval settings
eval_batch_size: 256
topk: [5,10,50,200]
metrics: ['Recall', 'NDCG']
valid_metric: NDCG@200
metric_decimal_place: 7
eval_step: 1
stopping_step: 5
strategy: ddp
precision: bf16-mixed
# general
seed: 2020
state: INFO
use_text: True
reproducibility: True
checkpoint_dir: 'saved'
show_progress: True
log_wandb: False
wandb_project: 'REC'
MAX_ITEM_LIST_LENGTH: 50
MAX_TEXT_LENGTH: 64
data_path: ../dataset/ # dataset path
dataset: Pixel200K # dataset name
text_path: text_path # Use absolute path
text_keys: ['title', 'tag', 'description']
item_prompt: 'Compress the following sentence into embedding: '
item_emb_token_n: 1
loss: nce
# training settings
epochs: 10
train_batch_size: 8
optim_args: {
learning_rate: 1e-4,
weight_decay: 0.01
}
scheduler_args: {
type: cosine,
warmup: 0.1
}
# eval settings
eval_batch_size: 8
topk: [5,10,50,200]
metrics: ['Recall', 'NDCG']
valid_metric: NDCG@200
metric_decimal_place: 7
eval_step: 1
stopping_step: 5
strategy: deepspeed
precision: bf16-mixed
stage: 2
# cd saved_path/HLLM-0.pth
python saved_path/HLLM-0.pth/zero_to_fp32.py saved_path/HLLM-0.pth saved_path/pytorch_model.bin
# Copyright (c) 2024 westlake-repl
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# SPDX-License-Identifier: MIT
# This file has been modified by Junyi Chen.
#
# Original file was released under MIT, with the full license text
# available at https://choosealicense.com/licenses/mit/.
#
# This modified file is released under the same license.
from logging import getLogger
import argparse
import json
import os
import numpy as np
import torch
import torch.distributed as dist
from REC.data import *
from REC.config import Config
from REC.utils import init_logger, get_model, init_seed, set_color
from REC.trainer import Trainer
def convert_str(s):
try:
if s.lower() == 'none':
return None
if s.lower() == 'true':
return True
if s.lower() == 'false':
return False
float_val = float(s)
if float_val.is_integer():
return int(float_val)
return float_val
except ValueError:
print(f"Unable to convert the string '{s}' to None / Bool / Float / Int, retaining the original string.")
return s
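# Illustrative examples (not from the original file) of the conversions performed:
#   convert_str('None')  -> None
#   convert_str('true')  -> True
#   convert_str('5')     -> 5
#   convert_str('1e-4')  -> 0.0001
#   convert_str('nce')   -> 'nce'  (unparsable strings are kept as-is, with a warning printed)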
def run_loop(local_rank, config_file=None, saved=True, extra_args=None):
# configurations initialization
config = Config(config_file_list=config_file)
device = torch.device("cuda", local_rank)
config['device'] = device
if extra_args:
for i in range(0, len(extra_args), 2):
key = extra_args[i][2:]
value = extra_args[i + 1]
try:
if '[' in value or '{' in value:
value = json.loads(value)
print("json:", value)
if isinstance(value, dict):
for k, v in value.items():
value[k] = convert_str(v)
else:
value = [convert_str(x) for x in value]
else:
value = convert_str(value)
if '.' in key:
k1, k2 = key.split('.')
config[k1][k2] = value
else:
config[key] = value
except Exception as e:
    raise ValueError(f"{key} {value} invalid") from e
init_seed(config['seed'], config['reproducibility'])
# logger initialization
init_logger(config)
logger = getLogger()
if 'text_path' in config:
config['text_path'] = os.path.join(config['text_path'], config['dataset'] + '.csv')
logger.info(f"Update text_path to {config['text_path']}")
# get model and data
dataload = load_data(config)
train_loader, valid_loader, test_loader = bulid_dataloader(config, dataload)
print(f"{len(train_loader) = }")
print(f"{len(valid_loader) = }")
model = get_model(config['model'])(config, dataload)
# model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
world_size = torch.distributed.get_world_size()
trainer = Trainer(config, model)
logger.info(set_color('\nWorld_Size', 'pink') + f' = {world_size} \n')
logger.info(config)
logger.info(dataload)
logger.info(model)
if config['val_only']:
ckpt_path = os.path.join(config['checkpoint_dir'], 'pytorch_model.bin')
ckpt = torch.load(ckpt_path, map_location='cpu')
logger.info(f'Eval only model load from {ckpt_path}')
msg = trainer.model.load_state_dict(ckpt, False)
logger.info(f'{msg.unexpected_keys = }')
logger.info(f'{msg.missing_keys = }')
test_result = trainer.evaluate(test_loader, load_best_model=False, show_progress=config['show_progress'], init_model=True)
logger.info(set_color('test result', 'yellow') + f': {test_result}')
else:
# training process
best_valid_score, best_valid_result = trainer.fit(
train_loader, valid_loader, saved=saved, show_progress=config['show_progress']
)
logger.info('Training ended. ' + set_color('best valid ', 'yellow') + f': {best_valid_result}')
# model evaluation
test_result = trainer.evaluate(test_loader, load_best_model=saved, show_progress=config['show_progress'])
logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}')
logger.info(set_color('test result', 'yellow') + f': {test_result}')
return {
'best_valid_score': best_valid_score,
'valid_score_bigger': config['valid_metric_bigger'],
'best_valid_result': best_valid_result,
'test_result': test_result
}
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--config_file", nargs='+', type=str)
args, extra_args = parser.parse_known_args()
local_rank = int(os.environ['LOCAL_RANK'])
config_file = args.config_file
torch.cuda.set_device(local_rank)
dist.init_process_group(backend='nccl')
run_loop(local_rank=local_rank, config_file=config_file, extra_args=extra_args)
Please store interaction files in this path. Each file should look like:
| item_id | user_id | timestamp |
|---------|---------|-----------|
| item_i | user_j | time_k |
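As a rough illustration (the file name `Pixel200K.csv`, the CSV format, and the pandas usage below are assumptions based on the configs in this commit, not a specification of the expected format), such a file could be loaded and sorted per user like this:
```python
import pandas as pd

# Assumed layout: one interaction per row with item_id, user_id, timestamp columns.
interactions = pd.read_csv("../dataset/Pixel200K.csv")
# Sort each user's history chronologically before building item sequences.
interactions = interactions.sort_values(["user_id", "timestamp"])
print(interactions.head())
```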
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-py3.10-dtk24.04.3-ubuntu20.04
ENV DEBIAN_FRONTEND=noninteractive
# RUN yum update && yum install -y git cmake wget build-essential
# RUN source /opt/dtk-24.04.3/env.sh
# Install pip dependencies
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
setuptools==69.5.1
pandas==2.2.2
colorama==0.4.6
torch_geometric==2.5.3
accelerate==0.30.1
numpy==1.26.4
wandb==0.17.7
colorlog==6.8.2
# deepspeed==0.14.2
lightning==2.4.0
pytz==2020.5
PyYAML==6.0.1
tensorboardX==2.6.2.2
tensorflow_cpu==2.8.1
tqdm==4.66.4
transformers==4.41.1
fbgemm_gpu==0.5.0
# flash_attn==2.5.9.post1
sentencepiece==0.2.0
cd code && python3 main.py \
--config_file overall/LLM_deepspeed.yaml HLLM/HLLM.yaml \
--loss nce \
--epochs 5 \
--dataset Pixel200K \
--train_batch_size 8 \
--MAX_TEXT_LENGTH 256 \
--MAX_ITEM_LIST_LENGTH 10 \
--checkpoint_dir saved_path \
--optim_args.learning_rate 1e-4 \
--item_pretrain_dir TinyLlama-1.1B-Chat-v1.0 \
--user_pretrain_dir TinyLlama-1.1B-Chat-v1.0 \
--text_path "../information" \
--text_keys '[\"title\",\"tag\",\"description\"]' \
--val_only True