// TestBed.hpp
/*************************************************************************
 * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#pragma once
#include <map>
#include "CollectiveArgs.hpp"
#include "TestBedChild.hpp"
#include "EnvVars.hpp"
#include <gtest/gtest.h>

namespace RcclUnitTesting
{
  // Test harness that facilitates testing RCCL collectives across various
  // process / device configurations.
  //
  // Typical call order, as implied by the per-method requirements documented below:
  //   InitComms -> SetCollectiveArgs -> AllocateMem -> PrepareData ->
  //   ExecuteCollectives -> ValidateResults -> DeallocateMem -> DestroyComms -> Finalize
  //
  // A TestBed tracks its child processes through raw TestBedChild pointers and
  // releases them in Finalize() / the destructor, so instances are non-copyable.
  class TestBed
  {
  public:
    int                        numDevicesAvailable;   // # of devices detected on node
    std::vector<TestBedChild*> childList;             // List of child processes
    std::vector<int>           rankToChildMap;        // Tracks which child process each rank is assigned to
    std::vector<int>           rankToDeviceMap;       // Tracks which device each rank is assigned to
    int                        numActiveChildren;     // # of active children (with usable RCCL comms)
    int                        numActiveRanks;        // Current # of ranks in use
    int                        numCollectivesInGroup; // # of collectives to execute per group call
    bool                       useBlocking;           // Selects blocking vs non-blocking RCCL communication
    int                        numStreamsPerGroup;    // # of different streams available per group call
    EnvVars                    ev;                    // Environment variables

    // Constructor - Creates one child process per detected GPU device that waits for further commands
    TestBed();

    // Copying is disabled: a copy would share the same child-process pointers in childList,
    // and both the copy and the original would then try to release them on destruction.
    TestBed(TestBed const&)            = delete;
    TestBed& operator=(TestBed const&) = delete;

    // Prepare TestBed for use with GPUs across multiple child processes
    // - deviceIdsPerChild holds one list of device IDs per child process
    // - The remaining arguments correspond to the public members of the same name
    void InitComms(std::vector<std::vector<int>> const& deviceIdsPerChild,
                   int  const numCollectivesInGroup = 1,
                   bool const useBlocking           = true,
                   int  const numStreamsPerGroup    = 1);

    // Prepare TestBed for use with GPUs on a single child process
    // - numGpus is the number of GPUs to use
    // - The remaining arguments correspond to the public members of the same name
    void InitComms(int  const numGpus,
                   int  const numCollectivesInGroup = 1,
                   bool const useBlocking           = true,
                   int  const numStreamsPerGroup    = 1);

    // Set collectives arguments for specified collective / rank
    // Setting scalarsPerRank to non-null will create custom reduction operator
    //   NOTE(review): scalarsPerRank is not a parameter here - presumably it is a
    //   field of OptionalColArgs; confirm against CollectiveArgs.hpp
    // Using collId = -1 (default) applies settings to all collectives in group
    // Using rank = -1 (default) applies settings to all ranks
    void SetCollectiveArgs(ncclFunc_t      const funcType,
                           ncclDataType_t  const dataType,
                           size_t          const numInputElements,
                           size_t          const numOutputElements,
                           OptionalColArgs const &optionalArgs = {},
                           int             const collId        = -1,
                           int             const rank          = -1,
                           int             const streamIdx     = 0);

    // Allocate memory for specified collective / rank
    // - Requires SetCollectiveArgs to have been called already
    // Using collId = -1 (default) applies settings to all collectives in group
    // Using rank = -1 (default) applies settings to all ranks
    void AllocateMem(bool   const inPlace = false,
                     bool   const useManagedMem = false,
                     int    const collId = -1,
                     int    const rank = -1);

    // Initialize input and compute expected results
    // - Requires that SetCollectiveArgs and AllocateMem have already been called
    // Setting collId to -1 applies settings to all collectives in group
    // Setting rank to -1 applies settings to all ranks
    // Setting prepDataFunc to nullptr uses the default fill pattern routine
    void PrepareData(int const collId = -1,
                     int const rank = -1,
                     CollFuncPtr const prepDataFunc = nullptr);

    // Execute all collectives on all test children
    // Blocks until collective is completed
    // NOTE(review): presumably an empty currentRanks selects every rank - confirm in TestBed.cpp
    void ExecuteCollectives(std::vector<int> const &currentRanks = {}, bool const useHipGraph = false);

    // Perform results validation - compare output to expected
    // - isCorrect is an output parameter that receives the validation outcome
    // - collId / rank default to -1, as in the other per-collective / per-rank calls
    void ValidateResults(bool& isCorrect, int const collId = -1, int const rank = -1);

    // Release allocated memory
    // - collId / rank default to -1, as in the other per-collective / per-rank calls
    void DeallocateMem(int const collId = -1, int const rank = -1);

    // Release the RCCL comms
    void DestroyComms();

    // Explicit TestBed teardown that releases all child processes
    // No further calls to TestBed should be performed after this call
    void Finalize();

    // Destructor - Calls Finalize() to release all child processes
    ~TestBed();

    // Returns all the supported reduction operations based on build settings
    std::vector<ncclRedOp_t> const& GetAllSupportedRedOps();

    // Return all the supported data types based on build settings
    std::vector<ncclDataType_t> const& GetAllSupportedDataTypes();

    // Helper function that splits up GPUs to the given number of processes
    // - Returns one list of device IDs per process (the shape InitComms accepts)
    static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
                                                          int const numGpus,
                                                          int const ranksPerGpu);
    // Overload of the helper above that takes no ranksPerGpu argument
    static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
                                                          int const numGpus);

    // Generate a test case name from the given test configuration
    static std::string GetTestCaseName(int            const totalRanks,
                                       bool           const isMultiProcess,
                                       ncclFunc_t     const funcType,
                                       ncclDataType_t const dataType,
                                       ncclRedOp_t    const redOp,
                                       int            const root,
                                       bool           const inPlace,
                                       bool           const managedMem,
                                       bool           const useHipGraph,
                                       int            const ranksPerProc=1);

    // Run a simple sweep over the given lists of test parameters
    void RunSimpleSweep(std::vector<ncclFunc_t>     const& funcTypes,
                        std::vector<ncclDataType_t> const& dataTypes,
                        std::vector<ncclRedOp_t>    const& redOps,
                        std::vector<int>            const& roots,
                        std::vector<int>            const& numElements,
                        std::vector<bool>           const& inPlaceList,
                        std::vector<bool>           const& managedMemList,
                        std::vector<bool>           const& useHipGraphList);

    // Wait for user-input if in interactive mode
    void InteractiveWait(std::string message);

    // Used to track total number of calls to ExecuteCollectives()
    // - Returns a mutable reference to the counter
    static int& NumTestsRun();

  protected:
    // Ends the specified child process
    void StopChild(int const childId);
  };
}