"tests/git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "5185c522b0acb798ed8ebb9084510bbe5b58e73b"
Unverified commit cd0d6f3c, authored by zenlytix, committed by GitHub
Browse files

Fixes to support subscriptions with other VMs (#105)

* Update scripts to handle cases where you have other VMs in your subscription

* Support subscriptions with other VMs and fix the PDSH permission error

* Minor fix to support subscriptions with other VMs
parent 5897091e
#!/bin/bash #!/bin/bash
config_file=azure_config.json config_file=azure_config.json
if [ ! -f ${config_file} ]; then
echo "Cannot find $config_file"
exit 1
fi
location=`cat ${config_file} | jq .location | sed 's/"//g'`
rg=deepspeed_rg_$location
while getopts 'c:' flag; do while getopts 'c:' flag; do
case "${flag}" in case "${flag}" in
c) config_file="${OPTARG}" ;; c) config_file="${OPTARG}" ;;
...@@ -13,7 +21,7 @@ echo "Using $config_file" ...@@ -13,7 +21,7 @@ echo "Using $config_file"
nodeid=$1 nodeid=$1
cmds=${@:2} cmds=${@:2}
echo $nodeid $cmds echo $nodeid $cmds
ip_addr=`az vm list-ip-addresses | jq .[${nodeid}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` ip_addr=`az vm list-ip-addresses -g $rg | jq .[${nodeid}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'`
ssh_private_key=`cat ${config_file} | jq .ssh_private_key | sed 's/"//g'` ssh_private_key=`cat ${config_file} | jq .ssh_private_key | sed 's/"//g'`
if [ $ssh_private_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi if [ $ssh_private_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi
......
...@@ -5,6 +5,8 @@ if [ ! -f ${azure_config} ]; then ...@@ -5,6 +5,8 @@ if [ ! -f ${azure_config} ]; then
echo "Cannot find $azure_config" echo "Cannot find $azure_config"
exit 1 exit 1
fi fi
location=`cat ${azure_config} | jq .location | sed 's/"//g'`
rg=deepspeed_rg_$location
parallel=true parallel=true
command -v pdsh command -v pdsh
...@@ -34,14 +36,14 @@ if [ $parallel == true ]; then ...@@ -34,14 +36,14 @@ if [ $parallel == true ]; then
echo "parallel docker pull" echo "parallel docker pull"
hosts="" hosts=""
for node_id in {0..1}; do for node_id in {0..1}; do
addr=`az vm list-ip-addresses | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'`
hosts="${addr},${hosts}" hosts="${addr},${hosts}"
done done
PDSH_SSH_ARGS_APPEND=${args} pdsh -w $hosts -l ${username} $update_script PDSH_RCMD_TYPE=ssh PDSH_SSH_ARGS_APPEND=${args} pdsh -w $hosts -l ${username} $update_script
else else
echo "sequential docker pull" echo "sequential docker pull"
for node_id in `seq 0 $((num_vms - 1))`; do for node_id in `seq 0 $((num_vms - 1))`; do
ip_addr=`az vm list-ip-addresses | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` ip_addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'`
addr=${username}@${ip_addr} addr=${username}@${ip_addr}
ssh ${args} $addr $update_script ssh ${args} $addr $update_script
done done
......
...@@ -5,6 +5,8 @@ if [ ! -f ${azure_config} ]; then ...@@ -5,6 +5,8 @@ if [ ! -f ${azure_config} ]; then
echo "Cannot find $azure_config" echo "Cannot find $azure_config"
exit 1 exit 1
fi fi
location=`cat ${azure_config} | jq .location | sed 's/"//g'`
rg=deepspeed_rg_$location
ssh_key=`cat ${azure_config} | jq .ssh_private_key | sed 's/"//g'` ssh_key=`cat ${azure_config} | jq .ssh_private_key | sed 's/"//g'`
if [ $ssh_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi if [ $ssh_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi
...@@ -14,8 +16,8 @@ if [ $docker_ssh_port == "null" ]; then echo 'missing docker_ssh_port in config' ...@@ -14,8 +16,8 @@ if [ $docker_ssh_port == "null" ]; then echo 'missing docker_ssh_port in config'
username=deepspeed username=deepspeed
args="-i ${ssh_key} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" args="-i ${ssh_key} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
num_vms=`az vm list | jq '. | length'` num_vms=`az vm list -g $rg | jq '. | length'`
first_ip_addr=`az vm list-ip-addresses | jq .[0].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` first_ip_addr=`az vm list-ip-addresses -g $rg | jq .[0].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'`
num_slots=`ssh $args ${username}@${first_ip_addr} 'nvidia-smi -L | wc -l'` num_slots=`ssh $args ${username}@${first_ip_addr} 'nvidia-smi -L | wc -l'`
echo "number of slots per vm: $num_slots" echo "number of slots per vm: $num_slots"
...@@ -24,7 +26,7 @@ ssh_config=config ...@@ -24,7 +26,7 @@ ssh_config=config
echo -n "" > $hostfile echo -n "" > $hostfile
echo -n "" > $ssh_config echo -n "" > $ssh_config
for node_id in `seq 0 $((num_vms - 1))`; do for node_id in `seq 0 $((num_vms - 1))`; do
private_ip_addr=`az vm list-ip-addresses | jq .[${node_id}].virtualMachine.network.privateIpAddresses[0] | sed 's/"//g'` private_ip_addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.privateIpAddresses[0] | sed 's/"//g'`
echo "worker-${node_id} slots=${num_slots}" >> hostfile echo "worker-${node_id} slots=${num_slots}" >> hostfile
echo "Host worker-${node_id} echo "Host worker-${node_id}
HostName ${private_ip_addr} HostName ${private_ip_addr}
...@@ -41,7 +43,7 @@ git clone https://github.com/microsoft/DeepSpeed.git workdir/DeepSpeed; ...@@ -41,7 +43,7 @@ git clone https://github.com/microsoft/DeepSpeed.git workdir/DeepSpeed;
" "
for node_id in `seq 0 $((num_vms - 1))`; do for node_id in `seq 0 $((num_vms - 1))`; do
ip_addr=`az vm list-ip-addresses | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` ip_addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'`
addr=${username}@${ip_addr} addr=${username}@${ip_addr}
echo "copying ssh keys, ssh config, hostfile to worker-${node_id}" echo "copying ssh keys, ssh config, hostfile to worker-${node_id}"
ssh $args ${addr} $update_script ssh $args ${addr} $update_script
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment