"vscode:/vscode.git/clone" did not exist on "685168f5a1679fb643d739aaecb48d09918ddf20"
prepare_datasets.py 5.23 KB
Newer Older
chenzk's avatar
v1.0  
chenzk committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
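"""Prepare packed token datasets for pretraining.

Reads plain-text (.txt) files, either from a CSV index or from an input
directory, tokenizes them with a Tiktoken or HuggingFace tokenizer, and
writes the token ids as flat uint16 streams to train.bin and val.bin.

Example invocation (paths and the tokenizer name are illustrative):

    python prepare_datasets.py --input_data_dir ./raw_texts --output_data_dir ./data/my_dataset --hf_tokenizer_path meta-llama/Llama-2-7b-hf
"""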
import argparse
import datetime
import glob
import os.path

import numpy as np
import pandas as pd

from allamo.logging import configure_logger, logger

EOS_TOKEN = "</s>"

def init_tokenizer(output_dir, tiktoken_tokenizer_name, hf_tokenizer_path):
    if hf_tokenizer_path:
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_path)
        vocab_size = len(tokenizer)
        logger.info(f"HuggingFace {hf_tokenizer_path} tokenizer loaded with vocab size {vocab_size}")
    elif tiktoken_tokenizer_name:
        import tiktoken
        tokenizer = tiktoken.get_encoding(tiktoken_tokenizer_name)
        vocab_size = tokenizer.max_token_value + 1  # token ids start at 0
        logger.info(f"Tiktoken {tiktoken_tokenizer_name} tokenizer loaded with vocab size {vocab_size}")
    else:
        raise ValueError('Tokenizer is not provided. Please specify either a Tiktoken tokenizer or a HuggingFace tokenizer')
    return tokenizer
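
# Usage sketch (the tokenizer names below are illustrative, not required by this script):
#   tokenizer = init_tokenizer(output_dir, "gpt2", None)                      # tiktoken encoding
#   tokenizer = init_tokenizer(output_dir, None, "meta-llama/Llama-2-7b-hf")  # HuggingFace tokenizer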

def load_list_of_txt_files(index_file_path, input_data_dir, data_split):
    if index_file_path:
        # Load the CSV index into a pandas dataframe
        index_df = pd.read_csv(index_file_path)

        # Replace any NaN values in the "File" column with an empty string
        index_df['File'] = index_df['File'].fillna('')

        # Keep only rows where the "File" column ends with ".txt";
        # copy to avoid pandas SettingWithCopyWarning when adding the "Split" column
        txt_files_df = index_df[index_df['File'].str.endswith('.txt')].copy()
        if 'Split' not in txt_files_df.columns:
            txt_files_df['Split'] = data_split if data_split else 'train'
    elif input_data_dir:
        txt_files = glob.glob(os.path.join(input_data_dir, "*.txt"))
        txt_files_df = pd.DataFrame({'File': txt_files})
        txt_files_df['Split'] = data_split if data_split else 'train'
    else:
        raise ValueError('Either an index file or an input data dir must be provided')

    logger.info(f"{len(txt_files_df)} txt files found to process")
    return txt_files_df
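
# Expected index CSV layout (illustrative example; only "File" is required,
# and "Split" falls back to --data_split when the column is missing):
#
#   File,Split
#   books/novel_01.txt,train
#   books/novel_02.txt,test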
    

def encode_file(input_file, output_file, tokenizer):
    # Tokenize the whole document and append it to the packed binary stream
    enc_data = np.array(tokenizer.encode(input_file.read()), dtype=np.uint16)
    enc_data.tofile(output_file)
    tokens = len(enc_data)

    # Append the EOS marker so consecutive documents stay separated
    eos_data = np.array(tokenizer.encode(EOS_TOKEN), dtype=np.uint16)
    eos_data.tofile(output_file)
    tokens += len(eos_data)

    return tokens
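
# The .bin files are flat streams of uint16 token ids, which assumes the
# tokenizer vocabulary fits in 16 bits (ids below 65536). A minimal sketch of
# loading a finished dataset back, e.g. for training:
#   data = np.memmap('train.bin', dtype=np.uint16, mode='r')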


def create_datasets(txt_files_df, tokenizer, input_data_dir, output_data_dir):
    train_tokens = 0
    val_tokens = 0
    files_cnt = 0
    # Make sure the output directory exists before opening the dataset files
    os.makedirs(output_data_dir, exist_ok=True)
    train_file_path = os.path.join(output_data_dir, 'train.bin')
    val_file_path = os.path.join(output_data_dir, 'val.bin')
    with open(train_file_path, 'wb+') as train_file, open(val_file_path, 'wb+') as val_file:
        # Process each txt file, routing 'test' rows to val.bin and everything else to train.bin
        for _, row in txt_files_df.iterrows():
            filename = os.path.join(input_data_dir, row['File']) if input_data_dir else row['File']
            if not os.path.isfile(filename):
                logger.warning(f"File {filename} does not exist.")
                continue
            with open(filename, 'r', encoding="utf-8") as txt_file:
                logger.info(f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - Start processing {filename}")
                if row['Split'] == 'test':
                    tokens = encode_file(txt_file, val_file, tokenizer)
                    val_tokens += tokens
                else:
                    tokens = encode_file(txt_file, train_file, tokenizer)
                    train_tokens += tokens
                logger.info(f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - {filename} added ({tokens} tokens) to the {row['Split']} dataset")
                files_cnt += 1

    total_tokens = train_tokens + val_tokens
    logger.info(f"Datasets created in {output_data_dir} from {files_cnt} files. Tokens: {total_tokens:,} (Train: {train_tokens:,} Val: {val_tokens:,})")

if __name__ == '__main__':
    configure_logger()
    parser = argparse.ArgumentParser(description='Prepare your datasets')
    parser.add_argument('--index_file', type=str, help='Path to a CSV index file with a "File" column and an optional "Split" column')
    parser.add_argument('--input_data_dir', type=str, help='Path to a directory with txt files')
    parser.add_argument('--data_split', type=str, default='train', choices=['train', 'test'], help='Split to assign when the index file has no "Split" column')
    parser.add_argument('--output_data_dir', type=str, required=True, help='Path to a directory for the output dataset files')
    parser.add_argument('--tiktoken_tokenizer_name', type=str, help='Tiktoken encoding name (a name accepted by tiktoken.get_encoding)')
    parser.add_argument('--hf_tokenizer_path', type=str, help='HuggingFace tokenizer name or path')

    args = parser.parse_args()
    tokenizer = init_tokenizer(args.output_data_dir, args.tiktoken_tokenizer_name, args.hf_tokenizer_path)
    txt_files_df = load_list_of_txt_files(args.index_file, args.input_data_dir, args.data_split)
    create_datasets(txt_files_df, tokenizer, args.input_data_dir, args.output_data_dir)
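
    # Optional sanity check (sketch, not part of the original flow): decode a few
    # tokens from the packed train.bin back to text to confirm the round trip.
    #   sample = np.memmap(os.path.join(args.output_data_dir, 'train.bin'), dtype=np.uint16, mode='r')[:32]
    #   logger.info(tokenizer.decode(sample.tolist()))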