start-cluster.yml 2.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Play: start the cluster container on every inventory host and make sure the
# rccl-tests benchmark binaries are available inside it.
# NOTE(review): container_name, image_name, ssh_port and gpu_target are not
# defined in this file; presumably they come from vars.yml or the inventory —
# confirm.
- name: Start docker cluster
  hosts: all
  become: true
  # Abort the play for all hosts as soon as any single host fails a task.
  any_errors_fatal: true
  vars_files:
    - vars.yml
  vars:
    # Directory inside the container where rccl-tests is cloned and built.
    rccl_tests_install_dir: /workspace/rccl-tests

  tasks:
    # Ensure the cluster container is running on this host.
    - name: Run cluster container
      community.docker.docker_container:
        # Identity and lifecycle.
        name: "{{ container_name }}"
        image: "{{ image_name }}"
        state: started
        # Recreate the container only when force_rm is explicitly set.
        recreate: "{{ force_rm | default(false) | bool }}"
        # Isolation: the container shares the host network and IPC namespaces.
        privileged: true
        network_mode: host
        ipc_mode: host
        shm_size: 512G
        # Name resolution: the container hostname follows the inventory name,
        # and /etc/hosts maps every inventory host to the default IPv4 address
        # taken from its gathered facts.
        hostname: "{{ inventory_hostname }}"
        etc_hosts: "{{ dict(groups['all'] | zip(groups['all'] | map('extract', hostvars, ['ansible_facts', 'default_ipv4', 'address']))) }}"
        # Filesystem.
        working_dir: /workspace
        volumes:
          - /opt/hyhal:/opt/hyhal:ro
          - /root/.ssh:/root/.ssh
        # Launch sshd on ssh_port, then run `sleep infinity` as the
        # long-lived container command.
        command: >
          bash -c "mkdir -p /run/sshd && /usr/sbin/sshd -p {{ ssh_port }}; sleep infinity"

    # Read-only probe for the installed benchmark binary. The registered
    # result (rccl_installed.rc) decides whether the install block runs.
    - name: Check if rccl-tests is already installed
      community.docker.docker_container_exec:
        container: "{{ container_name }}"
        command: test -f /usr/local/bin/all_reduce_perf
      register: rccl_installed
      # `test -f` exits 0 when the file exists and 1 when it does not; both
      # are valid answers. Any other exit code, or a result without `rc`
      # (the exec itself could not run), means the probe is broken and must
      # fail here instead of being silently treated as "not installed".
      failed_when: rccl_installed.rc is not defined or rccl_installed.rc not in [0, 1]
      changed_when: false

    # Clone, build and publish rccl-tests inside the container. Runs when the
    # probe did not find the binary or when force_reinstall is set.
    - name: Install rccl-tests
      when: rccl_installed.rc != 0 or (force_reinstall | default(false) | bool)
      block:
        # Start from a clean checkout: any previous tree is removed first.
        - name: Clone rccl-tests
          community.docker.docker_container_exec:
            container: "{{ container_name }}"
            command: >
              bash -c "
                rm -rf {{ rccl_tests_install_dir }} &&
                git clone https://github.com/ROCm/rccl-tests.git -b master {{ rccl_tests_install_dir }}
              "

        # Make hipify-perl reachable at /opt/dtk/bin, then run install.sh.
        # The symlink is only (re)created when the hipify-perl found on PATH
        # is not already the same file as /opt/dtk/bin/hipify-perl: linking a
        # path onto itself makes `ln -sf` fail or leave a self-referential
        # link, which would break reruns (e.g. force_reinstall).
        - name: Build rccl-tests
          community.docker.docker_container_exec:
            container: "{{ container_name }}"
            command: >
              bash -c "
                cd {{ rccl_tests_install_dir }} &&
                { [ $(which hipify-perl) -ef /opt/dtk/bin/hipify-perl ] ||
                ln -sf $(which hipify-perl) /opt/dtk/bin/hipify-perl; } &&
                ./install.sh --mpi --mpi_home /opt/mpi \
                  --rocm_home /opt/dtk \
                  --rccl_home /opt/dtk/rccl \
                  --hip_compiler hipcc \
                  --gpu_targets {{ gpu_target }}
              "

        # Publish every built *_perf binary to /usr/local/bin, the location
        # the install probe checks.
        - name: Copy rccl-tests binaries to global PATH
          community.docker.docker_container_exec:
            container: "{{ container_name }}"
            command: >
              bash -c "
                cp {{ rccl_tests_install_dir }}/build/*_perf /usr/local/bin/ &&
                chmod +x /usr/local/bin/*_perf
              "

    # Smoke test: invoke the benchmark binary by bare name inside the
    # container. The task is read-only, so it never reports "changed".
    - name: Verify rccl-tests installation
      changed_when: false
      community.docker.docker_container_exec:
        command: all_reduce_perf --help
        container: "{{ container_name }}"