# Play: start the cluster container on every inventory host and make sure the
# rccl-tests benchmark binaries are installed inside that container.
#
# Variables referenced but not defined in this play (presumably supplied by
# vars.yml, the inventory or extra-vars -- confirm): container_name,
# image_name, ssh_port, gpu_target.
# Optional toggles, both defaulting to false:
#   force_rm         - recreate the container even if it already exists
#   force_reinstall  - rebuild rccl-tests even if all_reduce_perf is present
- name: Start docker cluster
  hosts: all
  become: true
  any_errors_fatal: true
  vars_files:
    - vars.yml
  vars:
    # Checkout/build directory for rccl-tests inside the container.
    rccl_tests_install_dir: /workspace/rccl-tests

  tasks:
    - name: Run cluster container
      community.docker.docker_container:
        name: "{{ container_name }}"
        image: "{{ image_name }}"
        recreate: "{{ force_rm | default(false) | bool }}"
        state: started
        hostname: "{{ inventory_hostname }}"
        network_mode: host
        ipc_mode: host
        privileged: true
        shm_size: 512G
        volumes:
          - /opt/hyhal:/opt/hyhal:ro
          - /root/.ssh:/root/.ssh
        working_dir: /workspace
        # Builds a {hostname: ip} mapping by pairing each name in
        # groups['all'] with that host's ansible_facts.default_ipv4.address.
        # NOTE(review): this needs gathered facts for every host in the
        # inventory -- confirm fact gathering is not disabled or limited.
        etc_hosts: "{{ dict(groups['all'] | zip(groups['all'] | map('extract', hostvars, ['ansible_facts', 'default_ipv4', 'address']))) }}"
        # Folded scalar: the lines below are joined with single spaces into
        # one command. sshd is started on ssh_port, then `sleep infinity`
        # keeps the container's main process alive.
        command: >-
          bash -c "mkdir -p /run/sshd &&
          /usr/sbin/sshd -p {{ ssh_port }};
          sleep infinity"

    - name: Check if rccl-tests is already installed
      community.docker.docker_container_exec:
        container: "{{ container_name }}"
        command: test -f /usr/local/bin/all_reduce_perf
      register: rccl_installed
      # A missing binary is an expected outcome, not an error: never fail
      # here, and let the install block below inspect rccl_installed.rc.
      failed_when: false
      changed_when: false

    - name: Install rccl-tests
      when: rccl_installed.rc != 0 or (force_reinstall | default(false) | bool)
      block:
        - name: Clone rccl-tests
          community.docker.docker_container_exec:
            container: "{{ container_name }}"
            # Remove any previous checkout first so the clone target is empty.
            command: >-
              bash -c "
              rm -rf {{ rccl_tests_install_dir }} &&
              git clone https://github.com/ROCm/rccl-tests.git -b master
              {{ rccl_tests_install_dir }}
              "

        - name: Build rccl-tests
          community.docker.docker_container_exec:
            container: "{{ container_name }}"
            # No trailing backslashes: the folded scalar already joins these
            # lines with spaces. A backslash here would end up as `\ `
            # (an escaped space) in the bash script and glue neighbouring
            # arguments together.
            # $(which hipify-perl) is expanded by the bash started inside the
            # container, and the result is symlinked into /opt/dtk/bin
            # before install.sh runs.
            command: >-
              bash -c "
              cd {{ rccl_tests_install_dir }} &&
              ln -sf $(which hipify-perl) /opt/dtk/bin/hipify-perl &&
              ./install.sh --mpi
              --mpi_home /opt/mpi
              --rocm_home /opt/dtk
              --rccl_home /opt/dtk/rccl
              --hip_compiler hipcc
              --gpu_targets {{ gpu_target }}
              "

        - name: Copy rccl-tests binaries to global PATH
          community.docker.docker_container_exec:
            container: "{{ container_name }}"
            # The *_perf globs are expanded by bash inside the container.
            command: >-
              bash -c "
              cp {{ rccl_tests_install_dir }}/build/*_perf /usr/local/bin/ &&
              chmod +x /usr/local/bin/*_perf
              "

    # Runs whether or not the install block executed, so a present-but-broken
    # binary is caught as well.
    - name: Verify rccl-tests installation
      community.docker.docker_container_exec:
        container: "{{ container_name }}"
        command: all_reduce_perf --help
      changed_when: false