Please store raw item information files in this path. Each file should look like:
| item_id | description | title | tag (optional) |
|---------|---------------|---------|----------------|
| item_i | description_i | title_i | tag_i |
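
For illustration, a minimal pandas sketch of writing an item information file with these columns (the file name and rows below are hypothetical placeholders, not part of the repository):

```python
import pandas as pd

# Hypothetical example rows; real data would come from your item metadata source.
items = pd.DataFrame(
    [
        {
            "item_id": "item_0",
            "description": "description_0",
            "title": "title_0",
            "tag": "tag_0",  # the tag column is optional and can be omitted
        },
    ]
)

# Write one CSV per item file in this directory.
items.to_csv("example_items.csv", index=False)
```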
# Model code
modelCode=1409
# Model name
modelName=HLLM_pytorch
# Model description
modelDescription=HLLM, the two-tier LLM model proposed by ByteDance, matches the performance of conventional ID-based methods with only 1/6 to 1/4 of the data, and outperforms the SOTA model by 0.705%.
# Application scenarios
appScenario=training,recommender systems,retail,media,finance,telecommunications
# Framework type
frameType=pytorch
# Copyright (c) Meta Platforms, Inc. and affiliates.
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pandas as pd


def preprocess_interaction(interaction_path, output_path, prefix='books'):
    ratings = pd.read_csv(
        interaction_path,
        sep=",",
        names=["user_id", "item_id", "rating", "timestamp"],
    )
    print(f"{prefix} #data points before filter: {ratings.shape[0]}")
    print(
        f"{prefix} #user before filter: {len(set(ratings['user_id'].values))}"
    )
    print(
        f"{prefix} #item before filter: {len(set(ratings['item_id'].values))}"
    )

    # Filter out users and items that appear fewer than 5 times.
    item_id_count = (
        ratings["item_id"]
        .value_counts()
        .rename_axis("unique_values")
        .reset_index(name="item_count")
    )
    user_id_count = (
        ratings["user_id"]
        .value_counts()
        .rename_axis("unique_values")
        .reset_index(name="user_count")
    )
    ratings = ratings.join(item_id_count.set_index("unique_values"), on="item_id")
    ratings = ratings.join(user_id_count.set_index("unique_values"), on="user_id")
    ratings = ratings[ratings["item_count"] >= 5]
    ratings = ratings[ratings["user_count"] >= 5]
    # Re-check the per-user sequence length after item filtering.
    ratings = ratings.groupby('user_id').filter(lambda x: len(x['item_id']) >= 5)

    print(f"{prefix} #data points after filter: {ratings.shape[0]}")
    print(
        f"{prefix} #user after filter: {len(set(ratings['user_id'].values))}"
    )
    print(
        f"{prefix} #item after filter: {len(set(ratings['item_id'].values))}"
    )

    ratings = ratings[['item_id', 'user_id', 'timestamp']]
    ratings.to_csv(output_path, index=False, header=True)


def preprocess_item(item_path, output_path, prefix='books'):
    data = []
    for line in open(item_path):
        # Each line of the Amazon metadata file is a Python-style dict literal.
        json_data = eval(line)
        item_id = json_data.get('asin', '')
        description = json_data.get('description', '')
        title = json_data.get('title', '')
        data.append({
            'item_id': item_id,
            'description': description,
            'title': title
        })
    df = pd.DataFrame(data)
    df.to_csv(output_path, index=False)


if __name__ == '__main__':
    preprocess_interaction("ratings_Books.csv", "amazon_books.csv")
    preprocess_item("meta_Books.json", "amazon_books_item.csv")
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# 1B: 128 A100s for ≈ 0.5 days
# 7B: 128 A100s for ≈ 2 days
# For Books, train with a sequence length of 55 and 56*512=28672 negatives in total; serve with a sequence length of 50.
cd code && python3 main.py \
--config_file overall/LLM_deepspeed.yaml HLLM/HLLM.yaml \
--MAX_ITEM_LIST_LENGTH 55 \
--epochs 5 \
--optim_args.learning_rate 1e-4 \
--checkpoint_dir saved_dir \
--loss nce \
--MAX_TEXT_LENGTH 256 \
--scheduler_args.warmup 0.15 \
--dataset amazon_books \
--gradient_checkpointing True \
--text_keys '[\"title\",\"description\"]' \
--text_path text_path \
--item_pretrain_dir item_pretrain_dir \
--user_pretrain_dir user_pretrain_dir \
--train_batch_size 4 \
--data_split False \
--MAX_ITEM_LIST_LENGTH_TEST 50 \
--seed 42 \
--stage 3
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# 1B: 16 H100s for ≈ 2 days
cd code && python3 main.py \
--config_file overall/LLM_deepspeed.yaml HLLM/HLLM.yaml \
--MAX_ITEM_LIST_LENGTH 50 \
--epochs 5 \
--optim_args.learning_rate 1e-4 \
--checkpoint_dir saved_dir \
--loss nce \
--MAX_TEXT_LENGTH 256 \
--dataset amazon_books \
--gradient_checkpointing True \
--text_keys '[\"title\",\"description\"]' \
--train_batch_size 8 \
--text_path text_path \
--item_pretrain_dir item_pretrain_dir \
--user_pretrain_dir user_pretrain_dir \
--num_negatives 512
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# 1B: 32 A100s for ≈ 4.1 days
cd code && python3 main.py \
--config_file overall/LLM_deepspeed.yaml HLLM/HLLM.yaml \
--MAX_ITEM_LIST_LENGTH 10 \
--epochs 5 \
--optim_args.learning_rate 1e-4 \
--checkpoint_dir saved_dir \
--loss nce \
--MAX_TEXT_LENGTH 256 \
--dataset Pixel8M \
--text_path text_path \
--item_pretrain_dir item_pretrain_dir \
--user_pretrain_dir user_pretrain_dir \
--train_batch_size 16
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# batch_size = 16 GPUs * 8 = 128
cd code && python3 main.py \
--config_file IDNet/hstu.yaml overall/ID_deepspeed.yaml \
--optim_args.learning_rate 1e-3 \
--loss nce \
--train_batch_size 8 \
--MAX_ITEM_LIST_LENGTH 50 \
--epochs 201 \
--dataset amazon_books \
--hidden_dropout_prob 0.5 \
--attn_dropout_prob 0.5 \
--n_layers 22 \
--n_heads 32 \
--item_embedding_size 2048 \
--hstu_embedding_size 2048 \
--fix_temp True \
--num_negatives 512 \
--show_progress True \
--update_interval 100 \
--checkpoint_dir checkpoint_dir \
--stopping_step 10
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# Use 8 GPUs for batch_size = 8x64 = 512
cd code && python3 main.py \
--config_file IDNet/hstu.yaml overall/ID.yaml \
--optim_args.learning_rate 1e-4 \
--loss nce \
--train_batch_size 64 \
--MAX_ITEM_LIST_LENGTH 10 \
--epochs 50 \
--dataset Pixel8M \
--stopping_step 5 \
--show_progress True \
--update_interval 100
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# Use 8 GPUs for batch_size = 8x64 = 512
# A fixed temperature performs better.
cd code && python3 main.py \
--config_file IDNet/hstu.yaml overall/ID.yaml \
--optim_args.learning_rate 1e-3 \
--loss nce \
--train_batch_size 16 \
--MAX_ITEM_LIST_LENGTH 55 \
--epochs 201 \
--dataset amazon_books \
--hidden_dropout_prob 0.5 \
--attn_dropout_prob 0.5 \
--n_layers 16 \
--n_heads 8 \
--item_embedding_size 64 \
--hstu_embedding_size 64 \
--fix_temp True \
--data_split False \
--show_progress True \
--update_interval 100 \
--optim_args.weight_decay 0.0 \
--seed 42 \
--stopping_step 10
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# Use 8 GPUs for batch_size = 8x16 = 128
cd code && python3 main.py \
--config_file IDNet/hstu.yaml overall/ID.yaml \
--optim_args.learning_rate 1e-3 \
--loss nce \
--train_batch_size 16 \
--MAX_ITEM_LIST_LENGTH 50 \
--epochs 201 \
--dataset amazon_books \
--hidden_dropout_prob 0.5 \
--attn_dropout_prob 0.5 \
--n_layers 16 \
--n_heads 8 \
--item_embedding_size 64 \
--hstu_embedding_size 64 \
--num_negatives 512 \
--show_progress True \
--update_interval 100 \
--optim_args.weight_decay 0.0 \
--stopping_step 10
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# batch_size = 16 GPUs * 8 = 128
# flash attn requires bf16
cd code && python3 main.py \
--config_file IDNet/llama_id.yaml overall/ID_deepspeed.yaml \
--optim_args.learning_rate 1e-3 \
--loss nce \
--train_batch_size 8 \
--MAX_ITEM_LIST_LENGTH 50 \
--epochs 201 \
--dataset amazon_books \
--num_negatives 512 \
--item_embed_dim 512 \
--show_progress True \
--update_interval 100 \
--fix_temp True \
--optim_args.weight_decay 0.1 \
--user_pretrain_dir user_pretrain_dir \
--checkpoint_dir checkpoint_dir \
--stopping_step 10
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# Use 8 GPUs for batch_size = 8x16 = 128
cd code && python3 main.py \
--config_file IDNet/sasrec.yaml overall/ID.yaml \
--optim_args.learning_rate 1e-3 \
--loss nce \
--train_batch_size 16 \
--MAX_ITEM_LIST_LENGTH 50 \
--epochs 201 \
--dataset amazon_books \
--hidden_dropout_prob 0.5 \
--attn_dropout_prob 0.5 \
--num_negatives 512 \
--n_layers 4 \
--n_heads 4 \
--embedding_size 64 \
--inner_size 1 \
--show_progress True \
--update_interval 100 \
--optim_args.weight_decay 0.0 \
--stopping_step 10
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# Use 8 GPUs for batch_size = 8x64 = 512
cd code && python3 main.py \
--config_file IDNet/sasrec.yaml overall/ID.yaml \
--optim_args.learning_rate 1e-4 \
--loss nce \
--train_batch_size 64 \
--MAX_ITEM_LIST_LENGTH 10 \
--epochs 50 \
--dataset Pixel8M \
--stopping_step 5 \
--show_progress True \
--update_interval 100
setuptools==69.5.1
pandas==2.2.2
colorama==0.4.6
torch_geometric==2.5.3
accelerate==0.30.1
numpy==1.26.4
wandb==0.17.7
colorlog==6.8.2
# deepspeed==0.14.2
lightning==2.4.0
pytz==2020.5
PyYAML==6.0.1
tensorboardX==2.6.2.2
tensorflow_cpu==2.8.1
tqdm==4.66.4
transformers==4.41.1
fbgemm_gpu==0.5.0
# flash_attn==2.5.9.post1
sentencepiece==0.2.0
#!/bin/bash
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# 1B: 128 A100s for ≈ 0.5 days
# 7B: 128 A100s for ≈ 2 days
# For Books, train with a sequence length of 55 and 56*512=28672 negatives in total; serve with a sequence length of 50.
cd code && python3 main.py \
--config_file overall/LLM_deepspeed.yaml HLLM/HLLM.yaml \
--loss nce \
--epochs 5 \
--dataset Pixel200K \
--train_batch_size 8 \
--MAX_TEXT_LENGTH 256 \
--MAX_ITEM_LIST_LENGTH 10 \
--checkpoint_dir saved_path \
--optim_args.learning_rate 1e-4 \
--item_pretrain_dir TinyLlama-1.1B-Chat-v1.0 \
--user_pretrain_dir TinyLlama-1.1B-Chat-v1.0 \
--text_path "../information" \
--text_keys '[\"title\",\"tag\",\"description\"]'