Please store raw item information files in this path. Each file should look like:
| item_id | description | title | tag (optional) |
|---------|---------------|---------|----------------|
| item_i | description_i | title_i | tag_i |
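
For illustration, a minimal pandas sketch of writing an item information file with these columns (the file name and rows below are hypothetical placeholders, not part of the repository):

```python
import pandas as pd

# Hypothetical example rows; real data would come from your item metadata source.
items = pd.DataFrame(
    [
        {
            "item_id": "item_0",
            "description": "description_0",
            "title": "title_0",
            "tag": "tag_0",  # the tag column is optional and can be omitted
        },
    ]
)

# Write one CSV per item file in this directory.
items.to_csv("example_items.csv", index=False)
```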
# Model code
modelCode=1409
# Model name
modelName=HLLM_pytorch
# Model description
modelDescription=HLLM, the two-tier LLM model proposed by ByteDance, matches the performance of conventional ID-based methods with only 1/6 to 1/4 of the data, and outperforms the SOTA model by 0.705%.
# Application scenarios
appScenario=training,recommender systems,retail,media,finance,telecommunications
# Framework type
frameType=pytorch
# Copyright (c) Meta Platforms, Inc. and affiliates.
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pandas as pd


def preprocess_interaction(interaction_path, output_path, prefix='books'):
    ratings = pd.read_csv(
        interaction_path,
        sep=",",
        names=["user_id", "item_id", "rating", "timestamp"],
    )
    print(f"{prefix} #data points before filter: {ratings.shape[0]}")
    print(
        f"{prefix} #user before filter: {len(set(ratings['user_id'].values))}"
    )
    print(
        f"{prefix} #item before filter: {len(set(ratings['item_id'].values))}"
    )

    # Filter out users and items that appear fewer than 5 times.
    item_id_count = (
        ratings["item_id"]
        .value_counts()
        .rename_axis("unique_values")
        .reset_index(name="item_count")
    )
    user_id_count = (
        ratings["user_id"]
        .value_counts()
        .rename_axis("unique_values")
        .reset_index(name="user_count")
    )
    ratings = ratings.join(item_id_count.set_index("unique_values"), on="item_id")
    ratings = ratings.join(user_id_count.set_index("unique_values"), on="user_id")
    ratings = ratings[ratings["item_count"] >= 5]
    ratings = ratings[ratings["user_count"] >= 5]
    # Re-check the per-user sequence length after item filtering.
    ratings = ratings.groupby('user_id').filter(lambda x: len(x['item_id']) >= 5)

    print(f"{prefix} #data points after filter: {ratings.shape[0]}")
    print(
        f"{prefix} #user after filter: {len(set(ratings['user_id'].values))}"
    )
    print(
        f"{prefix} #item after filter: {len(set(ratings['item_id'].values))}"
    )

    ratings = ratings[['item_id', 'user_id', 'timestamp']]
    ratings.to_csv(output_path, index=False, header=True)


def preprocess_item(item_path, output_path, prefix='books'):
    data = []
    for line in open(item_path):
        # Each line of the Amazon metadata file is a Python-style dict literal.
        json_data = eval(line)
        item_id = json_data.get('asin', '')
        description = json_data.get('description', '')
        title = json_data.get('title', '')
        data.append({
            'item_id': item_id,
            'description': description,
            'title': title
        })
    df = pd.DataFrame(data)
    df.to_csv(output_path, index=False)


if __name__ == '__main__':
    preprocess_interaction("ratings_Books.csv", "amazon_books.csv")
    preprocess_item("meta_Books.json", "amazon_books_item.csv")
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# 1B: 128 A100s for ≈ 0.5 days
# 7B: 128 A100s for ≈ 2 days
# For Books, train with a sequence length of 55 and 56*512=28672 negatives in total; serve with a sequence length of 50.
cd code && python3 main.py \
--config_file overall/LLM_deepspeed.yaml HLLM/HLLM.yaml \
--MAX_ITEM_LIST_LENGTH 55 \
--epochs 5 \
--optim_args.learning_rate 1e-4 \
--checkpoint_dir saved_dir \
--loss nce \
--MAX_TEXT_LENGTH 256 \
--scheduler_args.warmup 0.15 \
--dataset amazon_books \
--gradient_checkpointing True \
--text_keys '[\"title\",\"description\"]' \
--text_path text_path \
--item_pretrain_dir item_pretrain_dir \
--user_pretrain_dir user_pretrain_dir \
--train_batch_size 4 \
--data_split False \
--MAX_ITEM_LIST_LENGTH_TEST 50 \
--seed 42 \
--stage 3
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# 1B: 16 H100s for ≈ 2 days
cd code && python3 main.py \
--config_file overall/LLM_deepspeed.yaml HLLM/HLLM.yaml \
--MAX_ITEM_LIST_LENGTH 50 \
--epochs 5 \
--optim_args.learning_rate 1e-4 \
--checkpoint_dir saved_dir \
--loss nce \
--MAX_TEXT_LENGTH 256 \
--dataset amazon_books \
--gradient_checkpointing True \
--text_keys '[\"title\",\"description\"]' \
--train_batch_size 8 \
--text_path text_path \
--item_pretrain_dir item_pretrain_dir \
--user_pretrain_dir user_pretrain_dir \
--num_negatives 512
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# 1B: 32 A100s for ≈ 4.1 days
cd code && python3 main.py \
--config_file overall/LLM_deepspeed.yaml HLLM/HLLM.yaml \
--MAX_ITEM_LIST_LENGTH 10 \
--epochs 5 \
--optim_args.learning_rate 1e-4 \
--checkpoint_dir saved_dir \
--loss nce \
--MAX_TEXT_LENGTH 256 \
--dataset Pixel8M \
--text_path text_path \
--item_pretrain_dir item_pretrain_dir \
--user_pretrain_dir user_pretrain_dir \
--train_batch_size 16
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# batch_size = 16 GPUs * 8 = 128
cd code && python3 main.py \
--config_file IDNet/hstu.yaml overall/ID_deepspeed.yaml \
--optim_args.learning_rate 1e-3 \
--loss nce \
--train_batch_size 8 \
--MAX_ITEM_LIST_LENGTH 50 \
--epochs 201 \
--dataset amazon_books \
--hidden_dropout_prob 0.5 \
--attn_dropout_prob 0.5 \
--n_layers 22 \
--n_heads 32 \
--item_embedding_size 2048 \
--hstu_embedding_size 2048 \
--fix_temp True \
--num_negatives 512 \
--show_progress True \
--update_interval 100 \
--checkpoint_dir checkpoint_dir \
--stopping_step 10
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# Use 8 GPUs for batch_size = 8x64 = 512
cd code && python3 main.py \
--config_file IDNet/hstu.yaml overall/ID.yaml \
--optim_args.learning_rate 1e-4 \
--loss nce \
--train_batch_size 64 \
--MAX_ITEM_LIST_LENGTH 10 \
--epochs 50 \
--dataset Pixel8M \
--stopping_step 5 \
--show_progress True \
--update_interval 100
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# Use 8 GPUs for batch_size = 8x64 = 512
# A fixed temperature performs better.
cd code && python3 main.py \
--config_file IDNet/hstu.yaml overall/ID.yaml \
--optim_args.learning_rate 1e-3 \
--loss nce \
--train_batch_size 16 \
--MAX_ITEM_LIST_LENGTH 55 \
--epochs 201 \
--dataset amazon_books \
--hidden_dropout_prob 0.5 \
--attn_dropout_prob 0.5 \
--n_layers 16 \
--n_heads 8 \
--item_embedding_size 64 \
--hstu_embedding_size 64 \
--fix_temp True \
--data_split False \
--show_progress True \
--update_interval 100 \
--optim_args.weight_decay 0.0 \
--seed 42 \
--stopping_step 10
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# Use 8 GPUs for batch_size = 8x16 = 128
cd code && python3 main.py \
--config_file IDNet/hstu.yaml overall/ID.yaml \
--optim_args.learning_rate 1e-3 \
--loss nce \
--train_batch_size 16 \
--MAX_ITEM_LIST_LENGTH 50 \
--epochs 201 \
--dataset amazon_books \
--hidden_dropout_prob 0.5 \
--attn_dropout_prob 0.5 \
--n_layers 16 \
--n_heads 8 \
--item_embedding_size 64 \
--hstu_embedding_size 64 \
--num_negatives 512 \
--show_progress True \
--update_interval 100 \
--optim_args.weight_decay 0.0 \
--stopping_step 10
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# batch_size = 16 GPUs * 8 = 128
# flash attn requires bf16
cd code && python3 main.py \
--config_file IDNet/llama_id.yaml overall/ID_deepspeed.yaml \
--optim_args.learning_rate 1e-3 \
--loss nce \
--train_batch_size 8 \
--MAX_ITEM_LIST_LENGTH 50 \
--epochs 201 \
--dataset amazon_books \
--num_negatives 512 \
--item_embed_dim 512 \
--show_progress True \
--update_interval 100 \
--fix_temp True \
--optim_args.weight_decay 0.1 \
--user_pretrain_dir user_pretrain_dir \
--checkpoint_dir checkpoint_dir \
--stopping_step 10
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# Use 8 GPUs for batch_size = 8x16 = 128
cd code && python3 main.py \
--config_file IDNet/sasrec.yaml overall/ID.yaml \
--optim_args.learning_rate 1e-3 \
--loss nce \
--train_batch_size 16 \
--MAX_ITEM_LIST_LENGTH 50 \
--epochs 201 \
--dataset amazon_books \
--hidden_dropout_prob 0.5 \
--attn_dropout_prob 0.5 \
--num_negatives 512 \
--n_layers 4 \
--n_heads 4 \
--embedding_size 64 \
--inner_size 1 \
--show_progress True \
--update_interval 100 \
--optim_args.weight_decay 0.0 \
--stopping_step 10
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# Use 8 GPUs for batch_size = 8x64 = 512
cd code && python3 main.py \
--config_file IDNet/sasrec.yaml overall/ID.yaml \
--optim_args.learning_rate 1e-4 \
--loss nce \
--train_batch_size 64 \
--MAX_ITEM_LIST_LENGTH 10 \
--epochs 50 \
--dataset Pixel8M \
--stopping_step 5 \
--show_progress True \
--update_interval 100
setuptools==69.5.1
pandas==2.2.2
colorama==0.4.6
torch_geometric==2.5.3
accelerate==0.30.1
numpy==1.26.4
wandb==0.17.7
colorlog==6.8.2
# deepspeed==0.14.2
lightning==2.4.0
pytz==2020.5
PyYAML==6.0.1
tensorboardX==2.6.2.2
tensorflow_cpu==2.8.1
tqdm==4.66.4
transformers==4.41.1
fbgemm_gpu==0.5.0
# flash_attn==2.5.9.post1
sentencepiece==0.2.0
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# 1B: 128 A100s for ≈ 0.5 days
# 7B: 128 A100s for ≈ 2 days
# For Books, train with a sequence length of 55 and 56*512=28672 negatives in total; serve with a sequence length of 50.
cd code && python3 main.py \
--config_file overall/LLM_deepspeed.yaml HLLM/HLLM.yaml \
--loss nce \
--epochs 5 \
--dataset Pixel200K \
--train_batch_size 8 \
--MAX_TEXT_LENGTH 256 \
--MAX_ITEM_LIST_LENGTH 10 \
--checkpoint_dir saved_path \
--optim_args.learning_rate 1e-4 \
--item_pretrain_dir TinyLlama-1.1B-Chat-v1.0 \
--user_pretrain_dir TinyLlama-1.1B-Chat-v1.0 \
--text_path "../information" \
--text_keys '[\"title\",\"tag\",\"description\"]'