Unverified Commit 1e7ebc51 authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[python-package] Add support for NumPy 2.0, test against nightly versions of dependencies (fixes #6454) (#6467)

[python-package] Add support for NumPy 2.0, test against nightly versions of dependencies (fixes #6454) (#6467)
parent 63926827
#!/bin/bash

# Install the latest versions of lightgbm's dependencies --
# including pre-releases and nightly builds -- then run the
# C API and Python package test suites against them.
#
# ref: https://github.com/pydata/xarray/blob/31111b3afe44fd6f7dac363264e94186cc5168d2/.github/workflows/upstream-dev-ci.yaml

# fail fast: exit on any error (including in pipelines), treat unset
# variables as errors, and let ERR traps fire inside functions/subshells
set -e -E -u -o pipefail

echo "installing testing dependencies"
python -m pip install \
    cloudpickle \
    psutil \
    pytest
echo "done installing testing dependencies"

echo "installing lightgbm's dependencies"
# nightly wheels of the scientific-python stack; --prefer-binary avoids
# building from sdists, --pre allows dev/rc versions to satisfy the pins
python -m pip install \
    --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \
    --prefer-binary \
    --pre \
    --upgrade \
    'numpy>=2.0.0.dev0' \
    'matplotlib>=3.10.0.dev0' \
    'pandas>=3.0.0.dev0' \
    'scikit-learn>=1.6.dev0' \
    'scipy>=1.15.0.dev0'

# pyarrow nightlies are published to a separate index
python -m pip install \
    --extra-index-url https://pypi.fury.io/arrow-nightlies/ \
    --prefer-binary \
    --pre \
    --upgrade \
    'pyarrow>=17.0.0.dev0'

python -m pip install \
    'cffi>=1.15.1'
echo "done installing lightgbm's dependencies"

echo "installing lightgbm"
# --no-deps: dependencies were pinned to nightlies above; a plain install
# would downgrade them to the latest stable releases.
# NOTE: use 'python -m pip' (not bare 'pip') so the wheel is guaranteed to
# land in the same environment the dependencies were installed into.
python -m pip install --no-deps dist/*.whl
echo "done installing lightgbm"

echo "installed package versions:"
python -m pip freeze

echo ""
echo "running tests"
pytest tests/c_api_test/
pytest tests/python_package_test/
......@@ -75,6 +75,33 @@ jobs:
export PATH=${CONDA}/bin:${PATH}
$GITHUB_WORKSPACE/.ci/setup.sh || exit 1
$GITHUB_WORKSPACE/.ci/test.sh || exit 1
test-latest-versions:
name: Python - latest versions (ubuntu-latest)
runs-on: ubuntu-latest
timeout-minutes: 60
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: true
- name: Create wheel
run: |
docker run \
--rm \
--env CMAKE_BUILD_PARALLEL_LEVEL=${{ env.CMAKE_BUILD_PARALLEL_LEVEL }} \
-v $(pwd):/opt/lgb-build \
-w /opt/lgb-build \
lightgbm/vsts-agent:manylinux_2_28_x86_64 \
/bin/bash -c 'PATH=/opt/miniforge/bin:$PATH sh ./build-python.sh bdist_wheel --nomp'
- name: Test compatibility
run: |
docker run \
--rm \
-v $(pwd):/opt/lgb-build \
-w /opt/lgb-build \
python:3.11 \
/bin/bash ./.ci/test-python-latest.sh
test-oldest-versions:
name: Python - oldest supported versions (ubuntu-latest)
runs-on: ubuntu-latest
......@@ -89,6 +116,7 @@ jobs:
run: |
docker run \
--rm \
--env CMAKE_BUILD_PARALLEL_LEVEL=${{ env.CMAKE_BUILD_PARALLEL_LEVEL }} \
-v $(pwd):/opt/lgb-build \
-w /opt/lgb-build \
lightgbm/vsts-agent:manylinux_2_28_x86_64 \
......@@ -104,7 +132,7 @@ jobs:
all-python-package-jobs-successful:
if: always()
runs-on: ubuntu-latest
needs: [test, test-oldest-versions]
needs: [test, test-latest-versions, test-oldest-versions]
steps:
- name: Note that all tests succeeded
uses: re-actors/alls-green@v1.2.2
......
......@@ -356,10 +356,10 @@ def _list_to_1d_numpy(
array = data.ravel()
return _cast_numpy_array_to_dtype(array, dtype)
elif _is_1d_list(data):
return np.array(data, dtype=dtype, copy=False)
return np.asarray(data, dtype=dtype)
elif isinstance(data, pd_Series):
_check_for_bad_pandas_dtypes(data.to_frame().dtypes)
return np.array(data, dtype=dtype, copy=False) # SparseArray should be supported as well
return np.asarray(data, dtype=dtype) # SparseArray should be supported as well
else:
raise TypeError(
f"Wrong type({type(data).__name__}) for {name}.\n" "It should be list, numpy 1-D array or pandas Series"
......@@ -728,7 +728,7 @@ def _convert_from_sliced_object(data: np.ndarray) -> np.ndarray:
def _c_float_array(data: np.ndarray) -> Tuple[_ctypes_float_ptr, int, np.ndarray]:
"""Get pointer of float numpy array / list."""
if _is_1d_list(data):
data = np.array(data, copy=False)
data = np.asarray(data)
if _is_numpy_1d_array(data):
data = _convert_from_sliced_object(data)
assert data.flags.c_contiguous
......@@ -749,7 +749,7 @@ def _c_float_array(data: np.ndarray) -> Tuple[_ctypes_float_ptr, int, np.ndarray
def _c_int_array(data: np.ndarray) -> Tuple[_ctypes_int_ptr, int, np.ndarray]:
"""Get pointer of int numpy array / list."""
if _is_1d_list(data):
data = np.array(data, copy=False)
data = np.asarray(data)
if _is_numpy_1d_array(data):
data = _convert_from_sliced_object(data)
assert data.flags.c_contiguous
......@@ -1270,7 +1270,7 @@ class _InnerPredictor:
preds: Optional[np.ndarray],
) -> Tuple[np.ndarray, int]:
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
data = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
else: # change non-float data to float data, need to copy
data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data, _ = _c_float_array(data)
......@@ -2285,9 +2285,9 @@ class Dataset:
self._handle = ctypes.c_void_p()
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
data = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
else: # change non-float data to float data, need to copy
data = np.array(mat.reshape(mat.size), dtype=np.float32)
data = np.asarray(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data, _ = _c_float_array(data)
_safe_call(
......@@ -2332,7 +2332,7 @@ class Dataset:
nrow[i] = mat.shape[0]
if mat.dtype == np.float32 or mat.dtype == np.float64:
mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
mats[i] = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
else: # change non-float data to float data, need to copy
mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)
......
......@@ -512,7 +512,7 @@ def _make_n_folds(
if hasattr(folds, "split"):
group_info = full_data.get_group()
if group_info is not None:
group_info = np.array(group_info, dtype=np.int32, copy=False)
group_info = np.asarray(group_info, dtype=np.int32)
flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
else:
flatted_group = np.zeros(num_data, dtype=np.int32)
......@@ -526,7 +526,7 @@ def _make_n_folds(
if not SKLEARN_INSTALLED:
raise LightGBMError("scikit-learn is required for ranking cv")
# ranking task, split according to groups
group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False)
group_info = np.asarray(full_data.get_group(), dtype=np.int32)
flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
group_kfold = _LGBMGroupKFold(n_splits=nfold)
folds = group_kfold.split(X=np.empty(num_data), groups=flatted_group)
......
......@@ -125,7 +125,7 @@ def load_from_mat(filename, reference):
mat = np.loadtxt(str(filename), dtype=np.float64)
label = mat[:, 0].astype(np.float32)
mat = mat[:, 1:]
data = np.array(mat.reshape(mat.size), dtype=np.float64, copy=False)
data = np.asarray(mat.reshape(mat.size), dtype=np.float64)
handle = ctypes.c_void_p()
ref = None
if reference is not None:
......@@ -203,7 +203,7 @@ def test_booster():
mat = data[:, 1:]
preb = np.empty(mat.shape[0], dtype=np.float64)
num_preb = ctypes.c_int64(0)
data = np.array(mat.reshape(mat.size), dtype=np.float64, copy=False)
data = np.asarray(mat.reshape(mat.size), dtype=np.float64)
LIB.LGBM_BoosterPredictForMat(
booster2,
data.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
......
......@@ -20,6 +20,10 @@ if os.getenv("ALLOW_SKIP_ARROW_TESTS") == "1":
else:
import pyarrow as pa # type: ignore
assert (
lgb.compat.PYARROW_INSTALLED is True
), "'pyarrow' and its dependencies must be installed to run the arrow tests"
# ----------------------------------------------------------------------------------------------- #
# UTILITIES #
# ----------------------------------------------------------------------------------------------- #
......
......@@ -777,7 +777,10 @@ def test_custom_objective_safety(rng):
def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name, rng):
pd = pytest.importorskip("pandas")
X = rng.uniform(size=(10, 2)).astype(dtype)
df = pd.DataFrame(X)
# copy=False is necessary because starting with pandas 3.0, pd.DataFrame() creates
# a copy of the input numpy array by default
# ref: https://github.com/pandas-dev/pandas/issues/58913
df = pd.DataFrame(X, copy=False)
built_data = lgb.basic._data_from_pandas(
data=df, feature_name=feature_name, categorical_feature="auto", pandas_categorical=None
)[0]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment