// TestBedChild.hpp
/*************************************************************************
 * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#pragma once

#include <vector>
#include <unistd.h>
#include "CollectiveArgs.hpp"
#include "rccl/rccl.h"

// NOTE(review): presumably the upper bound on the number of ranks the test bed
// handles; it is not referenced anywhere in this header - confirm against users.
#define MAX_RANKS 32
// NOTE(review): presumably an iteration cap for a wait/polling loop; it is not
// referenced in this header - confirm against users. The value does not fit in
// a 32-bit int, so the literal takes a wider integer type and any counter
// compared against it needs to be 64-bit to ever reach it.
#define MAX_LOOP_COUNTER 400000000000
namespace RcclUnitTesting
{
  // State and command handlers for one child process of the test bed.
  // The parent issues the commands enumerated below; each command is annotated
  // with the handler it corresponds to. Only declarations live in this header.
  // NOTE(review): ErrCode is not declared here - presumably it comes from
  // CollectiveArgs.hpp; confirm.
  class TestBedChild
  {
  public:
    // Commands that can be given to the child process.
    // The values are contiguous starting at 0 and NUM_CHILD_COMMANDS sizes the
    // ChildCommandNames table below, so the two must be kept in sync.
    enum
    {
      CHILD_GET_UNIQUE_ID    = 0,  // GetUniqueId()
      CHILD_INIT_COMMS       = 1,  // InitComms()
      CHILD_SET_COLL_ARGS    = 2,  // SetCollectiveArgs()
      CHILD_ALLOCATE_MEM     = 3,  // AllocateMem()
      CHILD_PREPARE_DATA     = 4,  // PrepareData()
      CHILD_EXECUTE_COLL     = 5,  // ExecuteCollectives()
      CHILD_VALIDATE_RESULTS = 6,  // ValidateResults()
      CHILD_DEALLOCATE_MEM   = 7,  // DeallocateMem()
      CHILD_DESTROY_COMMS    = 8,  // DestroyComms()
      CHILD_STOP             = 9,  // Stop() - NOTE(review): no Stop() member is declared in this class;
                                   // presumably handled inside StartExecutionLoop() - confirm
      NUM_CHILD_COMMANDS     = 10  // Number of commands above (not a command itself)
    };

    // Printable name for each command, listed in the same order as the enum
    // values above (presumably indexed by command value - confirm at use site).
    // Each name is stored in a fixed 20-char buffer; the longest entry,
    // "VALIDATE_RESULTS", needs 17 bytes including the terminator.
    // Note this is a non-static const member, so every TestBedChild instance
    // carries its own copy of the table.
    char const ChildCommandNames[NUM_CHILD_COMMANDS][20] =
    {
      "GET_UNIQUE_ID",
      "INIT_COMMS",
      "SET_COLL_ARGS",
      "ALLOCATE_MEM",
      "PREPARE_DATA",
      "EXECUTE_COLL",
      "VALIDATE_RESULTS",
      "DEALLOCATE_MEM",
      "DESTROY_COMMS",
      "STOP"
    };

    // These variables remain constant for the life of the TestBedChild
    int   childId;      // Identifier of this child (constructor argument)
    pid_t pid;          // Process ID - NOTE(review): presumably of the child process; where it is set is not visible here
    bool  verbose;      // Verbosity flag (constructor argument)
    int   printValues;  // Constructor argument - NOTE(review): exact meaning not visible in this header

    // Pipes used to communicate between the parent and child processes
    // NOTE(review): presumably "parent*" are the ends used by the parent and
    // "child*" the ends used by the child - confirm against InitPipes().
    int parentWriteFd;
    int parentReadFd;
    int childWriteFd;
    int childReadFd;

    // These variables may change based on commands issued by the parent
    int totalRanks;                                     // Total ranks
    int rankOffset;                                     // Global rank offset for this child
    int numCollectivesInGroup;                          // # of collectives to run per group call
    bool useBlocking;                                   // RCCL communication with blocking or non-blocking option
    int numStreamsPerGroup;                             // # of different streams allowed per group call
    std::vector<ncclComm_t> comms;                      // RCCL communicators for each rank
    std::vector<int> deviceIds;                         // Device IDs for each rank
    std::vector<std::vector<hipStream_t>> streams;      // Streams for executing collectives
    std::vector<std::vector<CollectiveArgs>> collArgs;  // Info for each collective for each rank

    // Constructor
    // Takes the child's ID, the verbosity flag and the printValues setting
    // (presumably stored in the members of the same names - confirm in the definition).
    TestBedChild(int const childId, bool const verbose, int const printValues);

    // Prepare parent/child communication pipes - to be executed by parent process
    // NOTE(review): the meaning of the int return value is not visible here
    // (presumably a status code) - confirm in the definition.
    int InitPipes();

    // Execution
    // NOTE(review): presumably the child-side loop that receives commands from
    // the parent and dispatches to the protected handlers below - confirm.
    void StartExecutionLoop();

  protected:
    // Handlers for the commands enumerated above; each one returns an ErrCode.

    // Calls ncclGetUniqueId and returns it to parent
    ErrCode GetUniqueId();

    // Initialize RCCL communicators
    ErrCode InitComms();

    // Set CollectiveArgs
    ErrCode SetCollectiveArgs();

    // Allocate memory (input (GPU) / output (GPU) / expected (CPU))
    ErrCode AllocateMem();

    // Prepare input and expected data
    ErrCode PrepareData();

    // Execute a group of collectives
    ErrCode ExecuteCollectives();

    // Validate that output matches expected
    ErrCode ValidateResults();

    // Release allocated memory
    ErrCode DeallocateMem();

    // Destroys RCCL communicators
    ErrCode DestroyComms();
  };
}