LlamaNcclGuard.h 2.71 KB
Newer Older
Li Zhang's avatar
Li Zhang committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
// Copyright (c) OpenMMLab. All rights reserved.

#pragma once

#include "src/fastertransformer/utils/nccl_utils.h"
#include <array>
#include <atomic>
#include <condition_variable>
#include <cuda_runtime.h>
#include <mutex>

namespace fastertransformer {

struct NcclGuard {
    static constexpr int kMaxGroupCount = 32;

    static std::mutex& globalNcclMutex()
    {
        static std::mutex inst;
        return inst;
    }

    struct GroupState {
        std::mutex              mutex;
        std::condition_variable cv;
        int                     ref_count;
    };

    static GroupState& groupState(int group_id)
    {
        static std::array<GroupState, kMaxGroupCount> array{};
        FT_CHECK(group_id < kMaxGroupCount);
        return array[group_id];
    }

    NcclGuard(NcclParam tensor_para, cudaStream_t stream, bool barrier = false):
        tensor_para_(tensor_para), stream_(stream), barrier_(barrier)
    {
        if (is_active()) {
            auto& group = groupState(tensor_para_.group_id_);
            if (tensor_para_.rank_ == 0) {
                /// TODO: use std::optional after switching to C++17
                global_nccl_lock_ = std::make_unique<std::lock_guard<std::mutex>>(globalNcclMutex());
                {
                    std::lock_guard<std::mutex> lock(group.mutex);
                    group.ref_count = tensor_para_.world_size_;
                }
                group.cv.notify_all();
            }
            else {
                std::unique_lock<std::mutex> lock(group.mutex);
                group.cv.wait(lock, [&] { return group.ref_count > 0; });
            }
        }
    }

    ~NcclGuard()
    {
        if (is_active()) {
            ftNcclStreamSynchronize(tensor_para_, NcclParam{}, stream_);

            auto& group = groupState(tensor_para_.group_id_);

            int value = -1;
            {
                std::lock_guard<std::mutex> lock(group.mutex);
                value = --group.ref_count;
            }
            if (value == 0) {
                group.cv.notify_all();
            }
            else if (barrier_ || tensor_para_.rank_ == 0) {
                std::unique_lock<std::mutex> lock(group.mutex);
                group.cv.wait(lock, [&] { return group.ref_count == 0; });
            }

            // rank 0 unlocks global NCCL mutex automatically
        }
    }

    bool is_active()
    {
        return barrier_ || (ftNcclGroupCount() > 1 && tensor_para_.world_size_ > 1);
    }

    NcclParam                                    tensor_para_;
    cudaStream_t                                 stream_;
    bool                                         barrier_;
    std::unique_ptr<std::lock_guard<std::mutex>> global_nccl_lock_;
};

AllentDan's avatar
AllentDan committed
92
}  // namespace fastertransformer