import dask.array as da
from distributed import Client, LocalCluster
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

if __name__ == "__main__":
    print("loading data")

    X, y = make_regression(n_samples=1000, n_features=50)

    print("initializing a Dask cluster")

    cluster = LocalCluster(n_workers=2)
    client = Client(cluster)

    print("created a Dask LocalCluster")

    print("distributing training data on the Dask cluster")

    dX = da.from_array(X, chunks=(100, 50))
    dy = da.from_array(y, chunks=(100,))

    print("beginning training")

    dask_model = lgb.DaskLGBMRegressor(n_estimators=10)
    dask_model.fit(dX, dy)
    assert dask_model.fitted_

    print("done training")

    print("predicting on the training data")

    preds = dask_model.predict(dX)

    # the code below uses sklearn.metrics, but this requires pulling all of the
    # predictions and target values back from workers to the client
    #
    # for larger datasets, consider the metrics from dask-ml instead
    # https://ml.dask.org/modules/api.html#dask-ml-metrics-metrics
    print("computing MSE")

    preds_local = preds.compute()
    actuals_local = dy.compute()
    mse = mean_squared_error(actuals_local, preds_local)

    print(f"MSE: {mse}")