# dataset_from_multi_hdf5.py

from contextlib import ExitStack
from pathlib import Path

import h5py
import numpy as np
import pandas as pd

import lightgbm as lgb


class HDFSequence(lgb.Sequence):
    """Expose an HDF5 dataset through the ``lgb.Sequence`` interface."""

    def __init__(self, hdf_dataset, batch_size):
        """
        Wrap an HDF5 dataset so that it can be used as a lightgbm Sequence.

        Parameters
        ----------
        hdf_dataset : h5py.Dataset
            Dataset in HDF5 file.
        batch_size : int
            Size of a batch. When reading data to construct lightgbm Dataset, each read reads batch_size rows.
        """
        # Only a reference to the dataset is stored; indexing is deferred to __getitem__.
        self.batch_size = batch_size
        self.data = hdf_dataset

    def __len__(self):
        """Return ``len()`` of the wrapped dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``self.data[idx]``; indexing is delegated to the wrapped dataset."""
        selected = self.data[idx]
        return selected


def create_dataset_from_multiple_hdf(input_flist, batch_size):
    """Create a LightGBM Dataset from multiple HDF5 files and save it as a binary file.

    Every input file must contain a dataset named "X" (features) and a dataset named "Y" (labels).
    The result is written to "regression.train.from_hdf.bin" in the current working directory.

    Parameters
    ----------
    input_flist : iterable of str or path-like
        HDF5 files to read, in the order their rows should appear in the Dataset.
    batch_size : int
        Batch size passed to each HDFSequence.

    Raises
    ------
    ValueError
        If ``input_flist`` is empty.
    """
    params = {
        "bin_construct_sample_cnt": 200000,
        "max_bin": 255,
    }
    # The sequences only hold references to the on-disk "X" datasets, and the Dataset is
    # constructed when save_binary() is called. So every file has to stay open until the binary
    # file has been written; ExitStack then closes all of them, even if an error occurs.
    with ExitStack() as stack:
        data = []
        ylist = []
        for fname in input_flist:
            hdf_file = stack.enter_context(h5py.File(fname, "r"))
            data.append(HDFSequence(hdf_file["X"], batch_size))
            # Labels are small, so they are read into memory in a single shot.
            ylist.append(hdf_file["Y"][:])

        if not ylist:
            raise ValueError("input_flist must contain at least one HDF5 file")

        y = np.concatenate(ylist)
        dataset = lgb.Dataset(data, label=y, params=params)
        # With binary dataset created, we can use either Python API or cmdline version to train.
        #
        # Note: in order to create exactly the same dataset with the one created in simple_example.py, we need
        # to modify simple_example.py to pass numpy array instead of pandas DataFrame to Dataset constructor.
        # The reason is that DataFrame column names will be used in Dataset. For a DataFrame with Int64Index
        # as columns, Dataset will use column names like ["0", "1", "2", ...]. While for numpy array, column names
        # are using the default one assigned in C++ code (dataset_loader.cpp), like ["Column_0", "Column_1", ...].
        dataset.save_binary("regression.train.from_hdf.bin")


def save2hdf(input_data, fname, batch_size):
    """Store 2-d array-like data to an HDF5 file, one HDF5 dataset per entry.

    Please note chunk size settings in the implementation for I/O performance optimization.

    Parameters
    ----------
    input_data : dict
        Mapping from HDF5 dataset name to a 2-d numpy array or pandas DataFrame.
        Entries with a single column are stored as 1-d arrays.
    fname : str or path-like
        Output file. It is opened in "w" mode, so an existing file is overwritten.
    batch_size : int
        Number of rows per HDF5 chunk for multi-column entries.
    """
    with h5py.File(fname, "w") as f:
        for name, data in input_data.items():
            # Normalize the input so that both numpy arrays and pandas DataFrames are accepted.
            values = np.asarray(data)
            nrow, ncol = values.shape
            if ncol == 1:
                # Y has a single column and we read it in single shot. So store it as an 1-d array.
                chunk = (nrow,)
                values = values.reshape(-1)
            else:
                # We use random access for data sampling when creating LightGBM Dataset from Sequence.
                # When accessing any element in a HDF5 chunk, it's read entirely.
                # To save I/O for sampling, we should keep number of total chunks much larger than sample count.
                # Here we are just creating a chunk size that matches with batch_size.
                #
                # Also note that the data is stored in row major order to avoid extra copy when passing to
                # lightgbm Dataset.
                #
                # h5py rejects a chunk shape larger than the data shape, so clamp the row count
                # for inputs that have fewer rows than batch_size.
                chunk = (min(batch_size, nrow), ncol)
            f.create_dataset(name, data=values, chunks=chunk, compression="lzf")


def generate_hdf(input_fname, output_basename, batch_size):
    """Split a tab-separated file into two HDF5 files and return their names.

    The input is read without a header row. Its first column is stored as dataset "Y" and the
    remaining columns as dataset "X". The first half of the rows goes to ``<output_basename>1.h5``
    and the rest to ``<output_basename>2.h5``.
    """
    frame = pd.read_csv(input_fname, header=None, sep="\t")

    # Save to 2 HDF5 files for demonstration.
    split_at = len(frame) // 2
    halves = (frame.iloc[:split_at], frame.iloc[split_at:])

    written = []
    for index, half in enumerate(halves, start=1):
        target = f"{output_basename}{index}.h5"
        # We can store multiple datasets inside a single HDF5 file.
        # Separating X and Y for choosing best chunk size for data loading.
        save2hdf({"Y": half.iloc[:, :1], "X": half.iloc[:, 1:]}, target, batch_size)
        written.append(target)

    return written


def main():
    """Convert the regression training data to HDF5 files and build a binary Dataset from them."""
    rows_per_batch = 64

    # The input lives in the "regression" directory next to this script's own directory.
    base_dir = Path(__file__).absolute().parents[1]
    train_path = str(base_dir / "regression" / "regression.train")

    hdf_paths = generate_hdf(train_path, "regression", rows_per_batch)
    create_dataset_from_multiple_hdf(hdf_paths, batch_size=rows_per_batch)


# Run the demonstration only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()