Unverified Commit 1f73f559 authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[dask] allow tight control over ports (#3994)



* [dask] allow tight control over ports

* getting there, getting there

* fix params maybe

* fixing params

* remove unnecessary stuff

* fix tests

* fixes

* some minor changes

* fix flaky test

* linting

* more linting

* clarify parameter description

* add warning

* revert docs change

* Update python-package/lightgbm/dask.py

* Apply suggestions from code review
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* trying to fix stuff

* this is working

* update tests

* Apply suggestions from code review
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* indent
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent b09c1ff7
......@@ -1139,7 +1139,7 @@ Network Parameters
- this parameter is needed to be set in both **socket** and **mpi** versions
- ``local_listen_port`` :raw-html:`<a id="local_listen_port" title="Permalink to this parameter" href="#local_listen_port">&#x1F517;&#xFE0E;</a>`, default = ``12400``, type = int, aliases: ``local_port``, ``port``, constraints: ``local_listen_port > 0``
- ``local_listen_port`` :raw-html:`<a id="local_listen_port" title="Permalink to this parameter" href="#local_listen_port">&#x1F517;&#xFE0E;</a>`, default = ``12400 (random for Dask-package)``, type = int, aliases: ``local_port``, ``port``, constraints: ``local_listen_port > 0``
- TCP listen port for local machines
......
......@@ -967,6 +967,7 @@ struct Config {
int num_machines = 1;
// check = >0
// default = 12400 (random for Dask-package)
// alias = local_port, port
// desc = TCP listen port for local machines
// desc = **Note**: don't forget to allow this port in firewall settings before training
......
......@@ -114,14 +114,13 @@ try:
from dask.array import Array as dask_Array
from dask.dataframe import DataFrame as dask_DataFrame
from dask.dataframe import Series as dask_Series
from dask.distributed import Client, default_client, get_worker, wait
from dask.distributed import Client, default_client, wait
DASK_INSTALLED = True
except ImportError:
DASK_INSTALLED = False
delayed = None
Client = object
default_client = None
get_worker = None
wait = None
class dask_Array:
......
......@@ -15,10 +15,9 @@ from urllib.parse import urlparse
import numpy as np
import scipy.sparse as ss
from .basic import _LIB, LightGBMError, _choose_param_value, _ConfigAliases, _log_warning, _safe_call
from .basic import _LIB, LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning, _safe_call
from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat,
dask_Array, dask_DataFrame, dask_Series, default_client, delayed, get_worker, pd_DataFrame,
pd_Series, wait)
dask_Array, dask_DataFrame, dask_Series, default_client, delayed, pd_DataFrame, pd_Series, wait)
from .sklearn import LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _lgbmmodel_doc_fit, _lgbmmodel_doc_predict
_DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series]
......@@ -140,22 +139,18 @@ def _train_part(
params: Dict[str, Any],
model_factory: Type[LGBMModel],
list_of_parts: List[Dict[str, _DaskPart]],
worker_address_to_port: Dict[str, int],
machines: str,
local_listen_port: int,
num_machines: int,
return_model: bool,
time_out: int = 120,
**kwargs: Any
) -> Optional[LGBMModel]:
local_worker_address = get_worker().address
machine_list = ','.join([
'%s:%d' % (urlparse(worker_address).hostname, port)
for worker_address, port
in worker_address_to_port.items()
])
network_params = {
'machines': machine_list,
'local_listen_port': worker_address_to_port[local_worker_address],
'machines': machines,
'local_listen_port': local_listen_port,
'time_out': time_out,
'num_machines': len(worker_address_to_port)
'num_machines': num_machines
}
params.update(network_params)
......@@ -199,6 +194,38 @@ def _split_to_parts(data: _DaskCollection, is_matrix: bool) -> List[_DaskPart]:
return parts
def _machines_to_worker_map(machines: str, worker_addresses: List[str]) -> Dict[str, int]:
"""Create a worker_map from machines list.
Given ``machines`` and a list of Dask worker addresses, return a mapping where the keys are
``worker_addresses`` and the values are ports from ``machines``.
Parameters
----------
machines : str
A comma-delimited list of workers, of the form ``ip1:port,ip2:port``.
worker_addresses : list of str
A list of Dask worker addresses, of the form ``{protocol}{hostname}:{port}``, where ``port`` is the port Dask's scheduler uses to talk to that worker.
Returns
-------
result : Dict[str, int]
Dictionary where keys are work addresses in the form expected by Dask and values are a port for LightGBM to use.
"""
machine_addresses = machines.split(",")
machine_to_port = defaultdict(set)
for address in machine_addresses:
host, port = address.split(":")
machine_to_port[host].add(int(port))
out = {}
for address in worker_addresses:
worker_host = urlparse(address).hostname
out[address] = machine_to_port[worker_host].pop()
return out
def _train(
client: Client,
data: _DaskMatrixLike,
......@@ -238,13 +265,46 @@ def _train(
-------
model : lightgbm.LGBMClassifier, lightgbm.LGBMRegressor, or lightgbm.LGBMRanker class
Returns fitted underlying model.
Note
----
This method handles setting up the following network parameters based on information
about the Dask cluster referenced by ``client``.
* ``local_listen_port``: port that each LightGBM worker opens a listening socket on,
to accept connections from other workers. This can differ from LightGBM worker
to LightGBM worker, but does not have to.
* ``machines``: a comma-delimited list of all workers in the cluster, in the
form ``ip:port,ip:port``. If running multiple Dask workers on the same host, use different
ports for each worker. For example, for ``LocalCluster(n_workers=3)``, you might
pass ``"127.0.0.1:12400,127.0.0.1:12401,127.0.0.1:12402"``.
* ``num_machines``: number of LightGBM workers.
* ``time_out``: time in minutes to wait before closing unused sockets.
The default behavior of this function is to generate ``machines`` from the list of
Dask workers which hold some piece of the training data, and to search for an open
port on each worker to be used as ``local_listen_port``.
If ``machines`` is provided explicitly in ``params``, this function uses the hosts
and ports in that list directly, and does not do any searching. This means that if
any of the Dask workers are missing from the list or any of those ports are not free
when training starts, training will fail.
If ``local_listen_port`` is provided in ``params`` and ``machines`` is not, this function
constructs ``machines`` from the list of Dask workers which hold some piece of the
training data, assuming that each one will use the same ``local_listen_port``.
"""
params = deepcopy(params)
params = _choose_param_value(
main_param_name="local_listen_port",
params=params,
default_value=12400
# capture whether local_listen_port or its aliases were provided
listen_port_in_params = any(
alias in params for alias in _ConfigAliases.get("local_listen_port")
)
# capture whether machines or its aliases were provided
machines_in_params = any(
alias in params for alias in _ConfigAliases.get("machines")
)
params = _choose_param_value(
......@@ -271,11 +331,12 @@ def _train(
)
# Some passed-in parameters can be removed:
# * 'machines': constructed automatically from Dask worker list
# * 'num_machines': set automatically from Dask worker list
# * 'num_threads': overridden to match nthreads on each Dask process
for param_alias in _ConfigAliases.get('machines', 'num_machines', 'num_threads'):
params.pop(param_alias, None)
for param_alias in _ConfigAliases.get('num_machines', 'num_threads'):
if param_alias in params:
_log_warning(f"Parameter {param_alias} will be ignored.")
params.pop(param_alias)
# Split arrays/dataframes into parts. Arrange parts into dicts to enforce co-locality
data_parts = _split_to_parts(data=data, is_matrix=True)
......@@ -312,14 +373,60 @@ def _train(
master_worker = next(iter(worker_map))
worker_ncores = client.ncores()
# find an open port on each worker. note that multiple workers can run
# on the same machine, so this needs to ensure that each one gets its
# own port
worker_address_to_port = _find_ports_for_workers(
client=client,
worker_addresses=worker_map.keys(),
local_listen_port=params["local_listen_port"]
# resolve aliases for network parameters and pop the result off params.
# these values are added back in calls to `_train_part()`
params = _choose_param_value(
main_param_name="local_listen_port",
params=params,
default_value=12400
)
local_listen_port = params.pop("local_listen_port")
params = _choose_param_value(
main_param_name="machines",
params=params,
default_value=None
)
machines = params.pop("machines")
# figure out network params
worker_addresses = worker_map.keys()
if machines is not None:
_log_info("Using passed-in 'machines' parameter")
worker_address_to_port = _machines_to_worker_map(
machines=machines,
worker_addresses=worker_addresses
)
else:
if listen_port_in_params:
_log_info("Using passed-in 'local_listen_port' for all workers")
unique_hosts = set(urlparse(a).hostname for a in worker_addresses)
if len(unique_hosts) < len(worker_addresses):
msg = (
"'local_listen_port' was provided in Dask training parameters, but at least one "
"machine in the cluster has multiple Dask worker processes running on it. Please omit "
"'local_listen_port' or pass 'machines'."
)
raise LightGBMError(msg)
worker_address_to_port = {
address: local_listen_port
for address in worker_addresses
}
else:
_log_info("Finding random open ports for workers")
worker_address_to_port = _find_ports_for_workers(
client=client,
worker_addresses=worker_map.keys(),
local_listen_port=local_listen_port
)
machines = ','.join([
'%s:%d' % (urlparse(worker_address).hostname, port)
for worker_address, port
in worker_address_to_port.items()
])
num_machines = len(worker_address_to_port)
# Tell each worker to train on the parts that it has locally
futures_classifiers = [
......@@ -328,7 +435,9 @@ def _train(
model_factory=model_factory,
params={**params, 'num_threads': worker_ncores[worker]},
list_of_parts=list_of_parts,
worker_address_to_port=worker_address_to_port,
machines=machines,
local_listen_port=worker_address_to_port[worker],
num_machines=num_machines,
time_out=params.get('time_out', 120),
return_model=(worker == master_worker),
**kwargs
......@@ -338,7 +447,24 @@ def _train(
results = client.gather(futures_classifiers)
results = [v for v in results if v]
return results[0]
model = results[0]
# if network parameters were changed during training, remove them from the
# returned model so that they're generated dynamically on every run based
# on the Dask cluster you're connected to and which workers have pieces of
# the training data
if not listen_port_in_params:
for param in _ConfigAliases.get('local_listen_port'):
model._other_params.pop(param, None)
if not machines_in_params:
for param in _ConfigAliases.get('machines'):
model._other_params.pop(param, None)
for param in _ConfigAliases.get('num_machines', 'timeout'):
model._other_params.pop(param, None)
return model
def _predict_part(
......
......@@ -174,6 +174,14 @@ def _accuracy_score(dy_true, dy_pred):
return da.average(dy_true == dy_pred).compute()
def _find_random_open_port() -> int:
"""Find a random open port on localhost"""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0))
port = s.getsockname()[1]
return port
def _pickle(obj, filepath, serializer):
if serializer == 'pickle':
with open(filepath, 'wb') as f:
......@@ -202,7 +210,7 @@ def _unpickle(filepath, serializer):
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('centers', data_centers)
def test_classifier(output, centers, client, listen_port):
def test_classifier(output, centers, client):
X, y, w, dX, dy, dw = _create_data(
objective='classification',
output=output,
......@@ -217,7 +225,6 @@ def test_classifier(output, centers, client, listen_port):
dask_classifier = lgb.DaskLGBMClassifier(
client=client,
time_out=5,
local_listen_port=listen_port,
**params
)
dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
......@@ -270,7 +277,7 @@ def test_classifier(output, centers, client, listen_port):
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('centers', data_centers)
def test_classifier_pred_contrib(output, centers, client, listen_port):
def test_classifier_pred_contrib(output, centers, client):
X, y, w, dX, dy, dw = _create_data(
objective='classification',
output=output,
......@@ -285,7 +292,6 @@ def test_classifier_pred_contrib(output, centers, client, listen_port):
dask_classifier = lgb.DaskLGBMClassifier(
client=client,
time_out=5,
local_listen_port=listen_port,
tree_learner='data',
**params
)
......@@ -340,13 +346,12 @@ def test_classifier_pred_contrib(output, centers, client, listen_port):
def test_training_does_not_fail_on_port_conflicts(client):
_, _, _, dX, dy, dw = _create_data('classification', output='array')
lightgbm_default_port = 12400
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('127.0.0.1', 12400))
s.bind(('127.0.0.1', lightgbm_default_port))
dask_classifier = lgb.DaskLGBMClassifier(
client=client,
time_out=5,
local_listen_port=12400,
n_estimators=5,
num_leaves=5
)
......@@ -362,7 +367,7 @@ def test_training_does_not_fail_on_port_conflicts(client):
@pytest.mark.parametrize('output', data_output)
def test_regressor(output, client, listen_port):
def test_regressor(output, client):
X, y, w, dX, dy, dw = _create_data(
objective='regression',
output=output
......@@ -376,7 +381,6 @@ def test_regressor(output, client, listen_port):
dask_regressor = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
local_listen_port=listen_port,
tree='data',
**params
)
......@@ -438,7 +442,7 @@ def test_regressor(output, client, listen_port):
@pytest.mark.parametrize('output', data_output)
def test_regressor_pred_contrib(output, client, listen_port):
def test_regressor_pred_contrib(output, client):
X, y, w, dX, dy, dw = _create_data(
objective='regression',
output=output
......@@ -452,7 +456,6 @@ def test_regressor_pred_contrib(output, client, listen_port):
dask_regressor = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
local_listen_port=listen_port,
tree_learner='data',
**params
)
......@@ -489,7 +492,7 @@ def test_regressor_pred_contrib(output, client, listen_port):
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('alpha', [.1, .5, .9])
def test_regressor_quantile(output, client, listen_port, alpha):
def test_regressor_quantile(output, client, alpha):
X, y, w, dX, dy, dw = _create_data(
objective='regression',
output=output
......@@ -505,7 +508,6 @@ def test_regressor_quantile(output, client, listen_port, alpha):
dask_regressor = lgb.DaskLGBMRegressor(
client=client,
local_listen_port=listen_port,
tree_learner_type='data_parallel',
**params
)
......@@ -539,7 +541,7 @@ def test_regressor_quantile(output, client, listen_port, alpha):
@pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical'])
@pytest.mark.parametrize('group', [None, group_sizes])
def test_ranker(output, client, listen_port, group):
def test_ranker(output, client, group):
if output == 'dataframe-with-categorical':
X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
......@@ -575,7 +577,6 @@ def test_ranker(output, client, listen_port, group):
dask_ranker = lgb.DaskLGBMRanker(
client=client,
time_out=5,
local_listen_port=listen_port,
tree_learner_type='data_parallel',
**params
)
......@@ -623,7 +624,7 @@ def test_ranker(output, client, listen_port, group):
@pytest.mark.parametrize('task', tasks)
def test_training_works_if_client_not_provided_or_set_after_construction(task, listen_port, client):
def test_training_works_if_client_not_provided_or_set_after_construction(task, client):
if task == 'ranking':
_, _, _, _, dX, dy, _, dg = _create_ranking_data(
output='array',
......@@ -643,7 +644,6 @@ def test_training_works_if_client_not_provided_or_set_after_construction(task, l
params = {
"time_out": 5,
"local_listen_port": listen_port,
"n_estimators": 1,
"num_leaves": 2
}
......@@ -700,7 +700,7 @@ def test_training_works_if_client_not_provided_or_set_after_construction(task, l
@pytest.mark.parametrize('serializer', ['pickle', 'joblib', 'cloudpickle'])
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize('set_client', [True, False])
def test_model_and_local_version_are_picklable_whether_or_not_client_set_explicitly(serializer, task, set_client, listen_port, tmp_path):
def test_model_and_local_version_are_picklable_whether_or_not_client_set_explicitly(serializer, task, set_client, tmp_path):
with LocalCluster(n_workers=2, threads_per_worker=1) as cluster1:
with Client(cluster1) as client1:
......@@ -743,7 +743,6 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici
params = {
"time_out": 5,
"local_listen_port": listen_port,
"n_estimators": 1,
"num_leaves": 2
}
......@@ -915,7 +914,6 @@ def test_warns_and_continues_on_unrecognized_tree_learner(client):
dask_regressor = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
local_listen_port=1234,
tree_learner='some-nonsense-value',
n_estimators=1,
num_leaves=2
......@@ -935,7 +933,6 @@ def test_warns_but_makes_no_changes_for_feature_or_voting_tree_learner(client):
dask_regressor = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
local_listen_port=1234,
tree_learner=tree_learner,
n_estimators=1,
num_leaves=2
......@@ -1033,6 +1030,141 @@ def test_training_succeeds_even_if_some_workers_do_not_have_any_data(client, tas
client.close(timeout=CLIENT_CLOSE_TIMEOUT)
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize('output', data_output)
def test_network_params_not_required_but_respected_if_given(client, task, output, listen_port):
    """Check how the Dask estimators treat the network parameters.

    Three models are trained on the same data:

    1. no network parameters given: training works and neither ``local_listen_port``
       nor ``machines`` shows up in ``get_params()`` afterwards
    2. ``machines`` given: training works and ``machines`` is kept in ``get_params()``
    3. only ``local_listen_port`` given: training is expected to raise ``LightGBMError``
       because several Dask workers share one host
    """
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    if task == 'ranking':
        _, _, _, _, dX, dy, _, dg = _create_ranking_data(
            output=output,
            group=None,
            chunk_size=10,
        )
        dask_model_factory = lgb.DaskLGBMRanker
    else:
        _, _, _, dX, dy, _ = _create_data(
            objective=task,
            output=output,
            chunk_size=10,
        )
        dg = None
        if task == 'classification':
            dask_model_factory = lgb.DaskLGBMClassifier
        elif task == 'regression':
            dask_model_factory = lgb.DaskLGBMRegressor

    # rebalance data to be sure that each worker has a piece of the data
    if output == 'array':
        client.rebalance()

    # model 1 - no network parameters given
    dask_model1 = dask_model_factory(
        n_estimators=5,
        num_leaves=5,
    )
    if task == 'ranking':
        dask_model1.fit(dX, dy, group=dg)
    else:
        dask_model1.fit(dX, dy)
    assert dask_model1.fitted_
    params = dask_model1.get_params()
    assert 'local_listen_port' not in params
    assert 'machines' not in params

    # model 2 - machines given
    n_workers = len(client.scheduler_info()['workers'])
    # _find_random_open_port() closes its socket before returning, so repeated calls are
    # not guaranteed to return different ports. Keep asking until every worker has its
    # own port, otherwise "machines" would contain duplicate entries.
    open_ports = []
    while len(open_ports) < n_workers:
        port = _find_random_open_port()
        if port not in open_ports:
            open_ports.append(port)
    # NOTE(review): assumes every worker of the ``client`` fixture runs on 127.0.0.1
    dask_model2 = dask_model_factory(
        n_estimators=5,
        num_leaves=5,
        machines=",".join([
            "127.0.0.1:" + str(port)
            for port in open_ports
        ]),
    )

    if task == 'ranking':
        dask_model2.fit(dX, dy, group=dg)
    else:
        dask_model2.fit(dX, dy)
    assert dask_model2.fitted_
    params = dask_model2.get_params()
    assert 'local_listen_port' not in params
    assert 'machines' in params

    # model 3 - local_listen_port given
    # training should fail because LightGBM will try to use the same
    # port for multiple worker processes on the same machine
    dask_model3 = dask_model_factory(
        n_estimators=5,
        num_leaves=5,
        local_listen_port=listen_port
    )
    error_msg = "has multiple Dask worker processes running on it"
    with pytest.raises(lgb.basic.LightGBMError, match=error_msg):
        if task == 'ranking':
            dask_model3.fit(dX, dy, group=dg)
        else:
            dask_model3.fit(dX, dy)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize('output', data_output)
def test_machines_should_be_used_if_provided(task, output):
    """Check that the ports listed in ``machines`` are the ones training tries to bind.

    One of the ports named in ``machines`` is occupied by a socket owned by the test
    while ``fit()`` runs, and training is expected to fail with a ``LightGBMError``
    that mentions that port.
    """
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
        if task == 'ranking':
            _, _, _, _, dX, dy, _, dg = _create_ranking_data(
                output=output,
                group=None,
                chunk_size=10,
            )
            dask_model_factory = lgb.DaskLGBMRanker
        else:
            _, _, _, dX, dy, _ = _create_data(
                objective=task,
                output=output,
                chunk_size=10,
            )
            dg = None
            if task == 'classification':
                dask_model_factory = lgb.DaskLGBMClassifier
            elif task == 'regression':
                dask_model_factory = lgb.DaskLGBMRegressor

        # rebalance data to be sure that each worker has a piece of the data
        if output == 'array':
            client.rebalance()

        n_workers = len(client.scheduler_info()['workers'])
        # _find_random_open_port() closes its socket before returning, so repeated calls
        # are not guaranteed to return different ports. Keep asking until every worker
        # has its own port, otherwise "machines" would contain duplicate entries.
        open_ports = []
        while len(open_ports) < n_workers:
            port = _find_random_open_port()
            if port not in open_ports:
                open_ports.append(port)
        dask_model = dask_model_factory(
            n_estimators=5,
            num_leaves=5,
            machines=",".join([
                "127.0.0.1:" + str(port)
                for port in open_ports
            ]),
        )

        # test that "machines" is actually respected by creating a socket that uses
        # one of the ports mentioned in "machines"
        error_msg = "Binding port %s failed" % open_ports[0]
        with pytest.raises(lgb.basic.LightGBMError, match=error_msg):
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                # hold the port for the whole duration of fit()
                s.bind(('127.0.0.1', open_ports[0]))
                if task == 'ranking':
                    dask_model.fit(dX, dy, group=dg)
                else:
                    dask_model.fit(dX, dy)
@pytest.mark.parametrize(
"classes",
[
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment